BunkaTopicModeling

A class to perform topic modeling on a set of documents.

This class utilizes clustering (default KMeans) to identify topics within a collection of documents. Each document and term is represented by embeddings, and topics are formed based on these embeddings. Topics are named using the top terms associated with them.
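
A minimal usage sketch (the import path follows the "Source code" location shown below; the `docs` and `terms` lists, with 2D coordinates already attached to each document, are assumed to come from an earlier step of the Bunka pipeline):

```python
# A minimal sketch, assuming docs (List[Document], each with x/y coordinates
# set) and terms (List[Term]) were produced by an earlier pipeline step.
from bunkatopics.topic_modeling.topic_model_builder import BunkaTopicModeling

model = BunkaTopicModeling(n_clusters=10, min_docs_per_cluster=10)
topics = model.fit_transform(docs=docs, terms=terms)

for topic in topics:
    print(topic.topic_id, topic.name, topic.size)
```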

Source code in bunkatopics/topic_modeling/topic_model_builder.py
class BunkaTopicModeling:
    """
    A class to perform topic modeling on a set of documents.

    This class utilizes clustering (default KMeans) to identify topics within a collection of documents.
    Each document and term is represented by embeddings, and topics are formed based on these embeddings.
    Topics are named using the top terms associated with them."""

    def __init__(
        self,
        n_clusters: int = 10,
        ngrams: list = [1, 2],
        name_length: int = 15,
        top_terms_overall: int = 1000,
        min_count_terms: int = 2,
        min_docs_per_cluster: int = 10,
        x_column: str = "x",
        y_column: str = "y",
        custom_clustering_model=None,
    ) -> None:
        """Constructs all the necessary attributes for the BunkaTopicModeling object.

        Arguments:
            n_clusters (int, optional): Number of clusters for K-Means. Defaults to 10.
            ngrams (list, optional): List of n-gram lengths to consider. Defaults to [1, 2].
            name_length (int, optional): Maximum number of terms in a topic name. Defaults to 15.
            top_terms_overall (int, optional): Number of top terms to consider overall. Defaults to 1000.
            min_count_terms (int, optional): Minimum count of terms to be considered. Defaults to 2.
            min_docs_per_cluster (int, optional): Minimum number of documents required per topic. Defaults to 10.
            x_column (str, optional): Column name for x-coordinate in the DataFrame. Defaults to "x".
            y_column (str, optional): Column name for y-coordinate in the DataFrame. Defaults to "y".
            custom_clustering_model (optional): Custom clustering model instance, if any. Defaults to None.
        """

        self.n_clusters = n_clusters
        self.ngrams = ngrams
        self.name_length = name_length
        self.top_terms_overall = top_terms_overall
        self.min_count_terms = min_count_terms
        self.x_column = x_column
        self.y_column = y_column
        self.custom_clustering_model = custom_clustering_model
        self.min_docs_per_cluster = min_docs_per_cluster

    def fit_transform(
        self,
        docs: t.List[Document],
        terms: t.List[Term],
    ) -> t.List[Topic]:
        """
        Analyzes documents and terms to form topics, assigns names to these topics based on the top terms,
        and returns a list of Topic instances.

        This method performs clustering on the document embeddings to identify distinct topics.
        Each topic is named based on the top terms associated with it. The method also calculates
        additional topic properties such as centroid coordinates and convex hulls.

        Arguments:
            docs (List[Document]): List of Document objects representing the documents to be analyzed.
            terms (List[Term]): List of Term objects representing the terms to be considered in topic naming.
        Returns:
            List[Topic]: A list of Topic objects, each representing a discovered topic with attributes
                     like name, size, centroid coordinates, and convex hull.

        Notes:
            - If a custom clustering model is not provided, the method defaults to using KMeans for clustering.
            - Topics are named using the most significant terms within each cluster.
            - The method calculates the centroid and convex hull for each topic based on the document embeddings.
        """

        # Collect each document's 2D embedding coordinates.

        x_values = [getattr(doc, self.x_column) for doc in docs]
        y_values = [getattr(doc, self.y_column) for doc in docs]

        df_embeddings_2D = pd.DataFrame(
            {
                "doc_id": [doc.doc_id for doc in docs],
                self.x_column: x_values,
                self.y_column: y_values,
            }
        )
        df_embeddings_2D = df_embeddings_2D.set_index("doc_id")

        if self.custom_clustering_model is None:
            clustering_model = KMeans(
                n_clusters=self.n_clusters, n_init="auto", random_state=42
            )

        else:
            clustering_model = self.custom_clustering_model

        df_embeddings_2D["topic_number"] = clustering_model.fit(
            df_embeddings_2D
        ).labels_.astype(str)

        df_embeddings_2D["topic_id"] = "bt" + "-" + df_embeddings_2D["topic_number"]

        topic_doc_dict = df_embeddings_2D["topic_id"].to_dict()
        for doc in docs:
            doc.topic_id = topic_doc_dict.get(doc.doc_id)  # None if the doc was not clustered

        terms = [x for x in terms if x.count_terms >= self.min_count_terms]

        df_terms = pd.DataFrame.from_records([term.model_dump() for term in terms])
        df_terms = df_terms.sort_values("count_terms", ascending=False)
        df_terms = df_terms.head(self.top_terms_overall)
        df_terms = df_terms[df_terms["ngrams"].isin(self.ngrams)]

        df_terms_indexed = pd.DataFrame.from_records([doc.model_dump() for doc in docs])

        df_terms_indexed = df_terms_indexed[["doc_id", "term_id", "topic_id"]]
        df_terms_indexed = df_terms_indexed.explode("term_id").reset_index(drop=True)

        df_terms_topics = pd.merge(df_terms_indexed, df_terms, on="term_id")

        df_topics_rep = specificity(
            df_terms_topics, X="topic_id", Y="term_id", Z=None, top_n=500
        )
        df_topics_rep = (
            df_topics_rep.groupby("topic_id")["term_id"].apply(list).reset_index()
        )
        df_topics_rep["name"] = df_topics_rep["term_id"].apply(lambda x: x[:100])
        df_topics_rep["name"] = df_topics_rep["name"].apply(lambda x: clean_terms(x))

        df_topics_rep["name"] = df_topics_rep["name"].apply(
            lambda x: x[: self.name_length]
        )
        df_topics_rep["name"] = df_topics_rep["name"].apply(lambda x: " | ".join(x))

        topics = [Topic(**x) for x in df_topics_rep.to_dict(orient="records")]

        df_topics_docs = pd.DataFrame.from_records([doc.model_dump() for doc in docs])
        df_topics_docs = df_topics_docs[["doc_id", "x", "y", "topic_id"]]
        df_topics_docs = df_topics_docs.groupby("topic_id").agg(
            size=("doc_id", "count"), x_centroid=("x", "mean"), y_centroid=("y", "mean")
        )

        topic_dict = df_topics_docs[["size", "x_centroid", "y_centroid"]].to_dict(
            "index"
        )

        for topic in topics:
            topic.size = topic_dict[topic.topic_id]["size"]
            topic.x_centroid = topic_dict[topic.topic_id]["x_centroid"]
            topic.y_centroid = topic_dict[topic.topic_id]["y_centroid"]

        # Drop topics with fewer than min_docs_per_cluster documents.
        topics = [x for x in topics if x.size >= self.min_docs_per_cluster]

        # Compute the convex hull of each topic's documents.
        try:
            for x in topics:
                topic_id = x.topic_id
                x_points = [doc.x for doc in docs if doc.topic_id == topic_id]
                y_points = [doc.y for doc in docs if doc.topic_id == topic_id]

                points = pd.DataFrame({"x": x_points, "y": y_points}).values

                x_ch, y_ch = get_convex_hull_coord(points, interpolate_curve=True)
                x_ch = list(x_ch)
                y_ch = list(y_ch)

                res = ConvexHullModel(x_coordinates=x_ch, y_coordinates=y_ch)
                x.convex_hull = res
        except Exception as e:
            print(e)

        # With HDBSCAN, the noise cluster ("bt--1") may need to be removed here.

        return topics
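
As the source shows, cluster labels are turned into topic ids of the form `bt-<label>`, and a topic's `name` is built by joining its most specific terms with `" | "`, keeping at most `name_length` of them. A hypothetical illustration:

```python
# Hypothetical top terms for one cluster, mirroring the naming logic above:
# keep the first name_length terms, then join them with " | ".
top_terms = ["climate", "carbon emissions", "renewable energy"]
name = " | ".join(top_terms[:15])  # name_length=15 keeps all three here
print(name)  # climate | carbon emissions | renewable energy
```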

__init__(n_clusters=10, ngrams=[1, 2], name_length=15, top_terms_overall=1000, min_count_terms=2, min_docs_per_cluster=10, x_column='x', y_column='y', custom_clustering_model=None)

Constructs all the necessary attributes for the BunkaTopicModeling object.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `n_clusters` | `int` | Number of clusters for K-Means. | `10` |
| `ngrams` | `list` | List of n-gram lengths to consider. | `[1, 2]` |
| `name_length` | `int` | Maximum number of terms in a topic name. | `15` |
| `top_terms_overall` | `int` | Number of top terms to consider overall. | `1000` |
| `min_count_terms` | `int` | Minimum count for a term to be considered. | `2` |
| `min_docs_per_cluster` | `int` | Minimum number of documents required per topic. | `10` |
| `x_column` | `str` | Column name for the x-coordinate in the DataFrame. | `'x'` |
| `y_column` | `str` | Column name for the y-coordinate in the DataFrame. | `'y'` |
| `custom_clustering_model` | optional | Custom clustering model instance, if any. | `None` |
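
For example, a configuration aiming at more, finer-grained topics might look like this (the parameter values are purely illustrative):

```python
# Illustrative settings only; tune them to your corpus.
model = BunkaTopicModeling(
    n_clusters=25,            # ask K-Means for more clusters
    ngrams=[1],               # name topics with unigrams only
    name_length=5,            # at most 5 terms per topic name
    min_docs_per_cluster=20,  # drop topics with fewer than 20 documents
)
```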

fit_transform(docs, terms)

Analyzes documents and terms to form topics, assigns names to these topics based on the top terms, and returns a list of Topic instances.

This method performs clustering on the document embeddings to identify distinct topics. Each topic is named based on the top terms associated with it. The method also calculates additional topic properties such as centroid coordinates and convex hulls.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `docs` | `List[Document]` | List of Document objects representing the documents to be analyzed. | required |
| `terms` | `List[Term]` | List of Term objects representing the terms to be considered in topic naming. | required |

Returns:

`List[Topic]`: A list of Topic objects, each representing a discovered topic with attributes like name, size, centroid coordinates, and convex hull.

Notes:

- If a custom clustering model is not provided, the method defaults to using KMeans for clustering.
- Topics are named using the most significant terms within each cluster.
- The method calculates the centroid and convex hull for each topic based on the document embeddings.
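
Because `fit_transform` only calls `fit()` on the clustering model and reads its `labels_` attribute, any scikit-learn-compatible clusterer can be passed as `custom_clustering_model`. A sketch using `HDBSCAN` (available in scikit-learn >= 1.3; `docs` and `terms` as in the earlier example):

```python
# Any estimator whose fit() returns self with a labels_ attribute will do;
# this contract is inferred from the source of fit_transform above.
from sklearn.cluster import HDBSCAN

model = BunkaTopicModeling(custom_clustering_model=HDBSCAN(min_cluster_size=15))
topics = model.fit_transform(docs=docs, terms=terms)
# Caveat: HDBSCAN labels noise points -1, so a "bt--1" pseudo-topic can
# appear; filter it out if it is not wanted.
```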