Skip to content

Bourdieu API

A class for performing Bourdieu analysis on a collection of documents.

This class leverages an embedding model to compute Bourdieu dimensions and topics for the given documents. It supports customization of the analysis through various parameters and the use of generative AI for topic naming.

Source code in bunkatopics/bourdieu/bourdieu_api.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
class BourdieuAPI:
    """
    A class for performing Bourdieu analysis on a collection of documents.

    This class leverages an embedding model to compute Bourdieu dimensions and topics
    for the given documents. It supports customization of the analysis through various parameters
    and the use of generative AI for topic naming.

    """

    def __init__(
        self,
        embedding_model: Embeddings,
        llm: t.Optional[LLM] = None,
        bourdieu_query: BourdieuQuery = BourdieuQuery(),
        topic_param: TopicParam = TopicParam(),
        topic_gen_param: TopicGenParam = TopicGenParam(),
        min_count_terms: int = 2,
        ranking_terms: int = 20,
        min_docs_per_cluster: int = 20,
    ) -> None:
        """
        Initializes the BourdieuAPI with the provided models, parameters, and configurations.

        Args:
            embedding_model: The model used for embedding documents.
            llm: The generative AI model for topic naming. When None (the default),
                 topics are not passed through the LLM naming step.
            bourdieu_query (BourdieuQuery, optional): Configuration for Bourdieu analysis.
                                                       Defaults to BourdieuQuery().
            topic_param (TopicParam, optional): Parameters for topic modeling. Defaults to TopicParam().
            topic_gen_param (TopicGenParam, optional): Parameters for the generative AI in topic naming.
                                                       Defaults to TopicGenParam().
            min_count_terms (int, optional): Minimum term count for topic modeling. Defaults to 2.
            ranking_terms (int, optional): Forwarded to DocumentRanker as `ranking_terms`. Defaults to 20.
            min_docs_per_cluster (int, optional): Forwarded to BunkaTopicModeling as
                                                  `min_docs_per_cluster`. Defaults to 20.

        Notes:
            The default `bourdieu_query`, `topic_param` and `topic_gen_param` objects are built once,
            when the function is defined, and are therefore shared by every instance that relies on
            the defaults. Treat them as read-only.
        """

        self.llm = llm
        self.embedding_model = embedding_model
        self.bourdieu_query = bourdieu_query
        self.topic_param = topic_param
        self.topic_gen_param = topic_gen_param
        self.min_count_terms = min_count_terms
        self.ranking_terms = ranking_terms
        self.min_docs_per_cluster = min_docs_per_cluster

    def fit_transform(
        self, docs: t.List[Document], terms: t.List[Term]
    ) -> t.Tuple[t.List[Document], t.List[Topic]]:
        """
        Processes the documents and terms to compute Bourdieu dimensions and topics.

        This method applies the embedding model to compute Bourdieu dimensions for each document
        based on provided queries. It also performs topic modeling on the documents and, if an
        LLM was supplied to the constructor, uses it for naming the topics.

        Args:
            docs (List[Document]): List of Document objects representing the documents to be analyzed.
            terms (List[Term]): List of Term objects representing the terms to be used in topic modeling.

        Returns:
            A tuple `(bourdieu_docs, bourdieu_topics)`: the documents kept after the radius filter
            (with `x` and `y` set to their "cont1" / "cont2" coordinates) and the topics computed
            on those documents.

        Notes:
            - The method first resets `bourdieu_dimensions` on every input document (in place).
            - It computes two continuums: "cont1" from the x left/right words and "cont2" from the
              y top/bottom words.
            - Only documents whose distance from the origin is greater than or equal to
              `max(cont1) * radius_size` are kept.
            - Topic modeling and document ranking are performed on the kept documents.
            - If `self.llm` is set, topics are then named using the generative AI model.
        """

        # Start from a clean slate: drop any dimensions already stored on the documents.
        for doc in docs:
            doc.bourdieu_dimensions = []

        query = self.bourdieu_query

        # Compute Continuums: "cont1" on the raw docs, then "cont2" on the result.
        new_docs = _get_continuum(
            self.embedding_model,
            docs,
            cont_name="cont1",
            left_words=query.x_left_words,
            right_words=query.x_right_words,
        )
        bourdieu_docs = _get_continuum(
            self.embedding_model,
            new_docs,
            cont_name="cont2",
            left_words=query.y_top_words,
            right_words=query.y_bottom_words,
        )

        # Long format: one row per (document, continuum) pair.
        df_bourdieu = pd.DataFrame(
            [
                {
                    "doc_id": x.doc_id,
                    "coordinates": [y.distance for y in x.bourdieu_dimensions],
                    "names": [y.continuum.id for y in x.bourdieu_dimensions],
                }
                for x in bourdieu_docs
            ]
        ).explode(["coordinates", "names"])

        # Wide format: one row per document, one column per continuum id.
        df_bourdieu_pivot = df_bourdieu[["doc_id", "coordinates", "names"]].pivot(
            index="doc_id", columns="names", values="coordinates"
        )

        # Distance of every document from the origin of the (cont1, cont2) plane.
        df_outsides = df_bourdieu_pivot.reset_index()
        x_values = df_outsides["cont1"].astype(float).values
        y_values = df_outsides["cont2"].astype(float).values
        distances = np.sqrt(x_values**2 + y_values**2)

        # The radius is scaled from the largest "cont1" value only; "cont2" is not considered.
        circle_radius = x_values.max() * query.radius_size

        # A set keeps the membership test below O(1) per document instead of
        # scanning a list of ids for every document.
        outside_ids = set(df_outsides.loc[distances >= circle_radius, "doc_id"])
        bourdieu_docs = [x for x in bourdieu_docs if x.doc_id in outside_ids]

        # Copy the pivoted coordinates onto the documents that were kept.
        bourdieu_dict = df_bourdieu_pivot.to_dict(orient="index")
        for doc in bourdieu_docs:
            coordinates = bourdieu_dict[doc.doc_id]
            doc.x = coordinates["cont1"]
            doc.y = coordinates["cont2"]

        # Compute Bourdieu topics
        topic_model = BunkaTopicModeling(
            n_clusters=self.topic_param.n_clusters,
            ngrams=self.topic_param.ngrams,
            name_length=self.topic_param.name_length,
            top_terms_overall=self.topic_param.top_terms_overall,
            min_count_terms=self.min_count_terms,
            min_docs_per_cluster=self.min_docs_per_cluster,
        )

        bourdieu_topics: t.List[Topic] = topic_model.fit_transform(
            docs=bourdieu_docs,
            terms=terms,
        )
        model_ranker = DocumentRanker(ranking_terms=self.ranking_terms)
        bourdieu_docs, bourdieu_topics = model_ranker.fit_transform(
            bourdieu_docs, bourdieu_topics
        )

        # Optional LLM pass over the topics; skipped when no LLM was provided.
        if self.llm:
            model_cleaning = LLMCleaningTopic(
                self.llm,
                language=self.topic_gen_param.language,
                use_doc=self.topic_gen_param.use_doc,
                context=self.topic_gen_param.context,
            )
            bourdieu_topics = model_cleaning.fit_transform(
                bourdieu_topics,
                bourdieu_docs,
            )

        return bourdieu_docs, bourdieu_topics

__init__(embedding_model, llm=None, bourdieu_query=BourdieuQuery(), topic_param=TopicParam(), topic_gen_param=TopicGenParam(), min_count_terms=2, ranking_terms=20, min_docs_per_cluster=20)

Initializes the BourdieuAPI with the provided models, parameters, and configurations.

Parameters:

Name Type Description Default
llm Optional[LLM]

The generative AI model for topic naming.

None
embedding_model Embeddings

The model used for embedding documents.

required
bourdieu_query BourdieuQuery

Configuration for Bourdieu analysis. Defaults to BourdieuQuery().

BourdieuQuery()
topic_param TopicParam

Parameters for topic modeling. Defaults to TopicParam().

TopicParam()
topic_gen_param TopicGenParam

Parameters for the generative AI in topic naming. Defaults to TopicGenParam().

TopicGenParam()
min_count_terms int

Minimum term count for topic modeling. Defaults to 2.

2
Source code in bunkatopics/bourdieu/bourdieu_api.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def __init__(
    self,
    embedding_model: Embeddings,
    llm: t.Optional[LLM] = None,
    bourdieu_query: BourdieuQuery = BourdieuQuery(),
    topic_param: TopicParam = TopicParam(),
    topic_gen_param: TopicGenParam = TopicGenParam(),
    min_count_terms: int = 2,
    ranking_terms: int = 20,
    min_docs_per_cluster: int = 20,
) -> None:
    """
    Store the models and settings used later by the analysis.

    Nothing is computed here; every argument is kept on the instance under
    an attribute of the same name.

    Args:
        embedding_model: The model used for embedding documents.
        llm: The generative AI model for topic naming. Defaults to None.
        bourdieu_query (BourdieuQuery, optional): Configuration for Bourdieu analysis.
                                                   Defaults to BourdieuQuery().
        topic_param (TopicParam, optional): Parameters for topic modeling. Defaults to TopicParam().
        topic_gen_param (TopicGenParam, optional): Parameters for the generative AI in topic naming.
                                                   Defaults to TopicGenParam().
        min_count_terms (int, optional): Minimum term count for topic modeling. Defaults to 2.
        ranking_terms (int, optional): Stored as `self.ranking_terms`. Defaults to 20.
        min_docs_per_cluster (int, optional): Stored as `self.min_docs_per_cluster`. Defaults to 20.
    """

    # Models
    self.embedding_model = embedding_model
    self.llm = llm

    # Structured configuration objects
    self.bourdieu_query = bourdieu_query
    self.topic_param = topic_param
    self.topic_gen_param = topic_gen_param

    # Scalar settings
    self.min_count_terms = min_count_terms
    self.ranking_terms = ranking_terms
    self.min_docs_per_cluster = min_docs_per_cluster

fit_transform(docs, terms)

Processes the documents and terms to compute Bourdieu dimensions and topics.

This method applies the embedding model to compute Bourdieu dimensions for each document based on provided queries. It also performs topic modeling on the documents and, if enabled, uses a generative AI model for naming the topics.

Parameters:

Name Type Description Default
docs List[Document]

List of Document objects representing the documents to be analyzed.

required
terms List[Term]

List of Term objects representing the terms to be used in topic modeling.

required
Notes
  • The method first resets Bourdieu dimensions for all documents.
  • It computes Bourdieu continuums based on the configured left and right words.
  • Documents are then filtered based on their position relative to a defined radius in the Bourdieu space.
  • Topic modeling is performed on the filtered set of documents.
  • If an llm was passed to the constructor, topics are named using that generative AI model.
Source code in bunkatopics/bourdieu/bourdieu_api.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def fit_transform(
    self, docs: t.List[Document], terms: t.List[Term]
) -> t.Tuple[t.List[Document], t.List[Topic]]:
    """
    Processes the documents and terms to compute Bourdieu dimensions and topics.

    This method applies the embedding model to compute Bourdieu dimensions for each document
    based on provided queries. It also performs topic modeling on the documents and, if an
    LLM was supplied to the constructor, uses it for naming the topics.

    Args:
        docs (List[Document]): List of Document objects representing the documents to be analyzed.
        terms (List[Term]): List of Term objects representing the terms to be used in topic modeling.

    Returns:
        A tuple `(bourdieu_docs, bourdieu_topics)`: the documents kept after the radius filter
        (with `x` and `y` set to their "cont1" / "cont2" coordinates) and the topics computed
        on those documents.

    Notes:
        - The method first resets `bourdieu_dimensions` on every input document (in place).
        - It computes two continuums: "cont1" from the x left/right words and "cont2" from the
          y top/bottom words.
        - Only documents whose distance from the origin is greater than or equal to
          `max(cont1) * radius_size` are kept.
        - Topic modeling and document ranking are performed on the kept documents.
        - If `self.llm` is set, topics are then named using the generative AI model.
    """

    # Start from a clean slate: drop any dimensions already stored on the documents.
    for doc in docs:
        doc.bourdieu_dimensions = []

    query = self.bourdieu_query

    # Compute Continuums: "cont1" on the raw docs, then "cont2" on the result.
    new_docs = _get_continuum(
        self.embedding_model,
        docs,
        cont_name="cont1",
        left_words=query.x_left_words,
        right_words=query.x_right_words,
    )
    bourdieu_docs = _get_continuum(
        self.embedding_model,
        new_docs,
        cont_name="cont2",
        left_words=query.y_top_words,
        right_words=query.y_bottom_words,
    )

    # Long format: one row per (document, continuum) pair.
    df_bourdieu = pd.DataFrame(
        [
            {
                "doc_id": x.doc_id,
                "coordinates": [y.distance for y in x.bourdieu_dimensions],
                "names": [y.continuum.id for y in x.bourdieu_dimensions],
            }
            for x in bourdieu_docs
        ]
    ).explode(["coordinates", "names"])

    # Wide format: one row per document, one column per continuum id.
    df_bourdieu_pivot = df_bourdieu[["doc_id", "coordinates", "names"]].pivot(
        index="doc_id", columns="names", values="coordinates"
    )

    # Distance of every document from the origin of the (cont1, cont2) plane.
    df_outsides = df_bourdieu_pivot.reset_index()
    x_values = df_outsides["cont1"].astype(float).values
    y_values = df_outsides["cont2"].astype(float).values
    distances = np.sqrt(x_values**2 + y_values**2)

    # The radius is scaled from the largest "cont1" value only; "cont2" is not considered.
    circle_radius = x_values.max() * query.radius_size

    # A set keeps the membership test below O(1) per document instead of
    # scanning a list of ids for every document.
    outside_ids = set(df_outsides.loc[distances >= circle_radius, "doc_id"])
    bourdieu_docs = [x for x in bourdieu_docs if x.doc_id in outside_ids]

    # Copy the pivoted coordinates onto the documents that were kept.
    bourdieu_dict = df_bourdieu_pivot.to_dict(orient="index")
    for doc in bourdieu_docs:
        coordinates = bourdieu_dict[doc.doc_id]
        doc.x = coordinates["cont1"]
        doc.y = coordinates["cont2"]

    # Compute Bourdieu topics
    topic_model = BunkaTopicModeling(
        n_clusters=self.topic_param.n_clusters,
        ngrams=self.topic_param.ngrams,
        name_length=self.topic_param.name_length,
        top_terms_overall=self.topic_param.top_terms_overall,
        min_count_terms=self.min_count_terms,
        min_docs_per_cluster=self.min_docs_per_cluster,
    )

    bourdieu_topics: t.List[Topic] = topic_model.fit_transform(
        docs=bourdieu_docs,
        terms=terms,
    )
    model_ranker = DocumentRanker(ranking_terms=self.ranking_terms)
    bourdieu_docs, bourdieu_topics = model_ranker.fit_transform(
        bourdieu_docs, bourdieu_topics
    )

    # Optional LLM pass over the topics; skipped when no LLM was provided.
    if self.llm:
        model_cleaning = LLMCleaningTopic(
            self.llm,
            language=self.topic_gen_param.language,
            use_doc=self.topic_gen_param.use_doc,
            context=self.topic_gen_param.context,
        )
        bourdieu_topics = model_cleaning.fit_transform(
            bourdieu_topics,
            bourdieu_docs,
        )

    return bourdieu_docs, bourdieu_topics