Skip to content

Topic Extractor

Source code in bunkatopics/topic_modeling/document_topic_ranker.py
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
class DocumentRanker:
    """
    Rank documents inside their topic by overlap with the topic's leading terms.

    For every topic, the documents assigned to it are scored by how many of
    their terms appear among the topic's first ``ranking_terms`` terms. The
    best ``max_doc_per_topic`` documents per topic are kept.
    """

    def __init__(self, ranking_terms: int = 20, max_doc_per_topic: int = 20) -> None:
        """
        Initialize the class with ranking_terms and max_doc_per_topic parameters.

        Args:
            ranking_terms (int): Number of leading topic terms used for scoring.
            max_doc_per_topic (int): Maximum number of documents kept per topic.

        Returns:
            None
        """
        self.ranking_terms = ranking_terms
        self.max_doc_per_topic = max_doc_per_topic

    def fit_transform(
        self,
        docs: t.List[Document],
        topics: t.List[Topic],
    ) -> t.Tuple[t.List[Document], t.List[Topic]]:
        """
        Calculate top documents for each topic based on ranking terms.

        Each document receives a ``topic_ranking`` (``None`` when it is not
        among the kept documents of its topic) and each topic receives
        ``top_doc_content``, the contents of its kept documents ordered from
        best to worst (``None`` when no document qualifies). Both inputs are
        modified in place and returned.

        Args:
            docs (List[Document]): List of documents.
            topics (List[Topic]): List of topics.

        Returns:
            Tuple[List[Document], List[Topic]]: Updated lists of documents and topics.
        """
        # With no documents or no topics the frames below would have no
        # columns and the column selection would raise KeyError. Nothing can
        # be ranked, so apply the same "unranked" values the main path uses.
        if not docs or not topics:
            for doc in docs:
                doc.topic_ranking = None
            for topic in topics:
                topic.top_doc_content = None
            return docs, topics

        # Dump every document once and reuse the frame for both the term
        # table and the content lookup further down.
        df_all_docs = pd.DataFrame.from_records([doc.model_dump() for doc in docs])
        df_content = df_all_docs[["doc_id", "content"]]

        # One row per (document, term). An empty term list explodes to NaN;
        # drop those rows so they cannot pair up with NaN rows on the topic
        # side during the merge.
        df_docs = (
            df_all_docs[["doc_id", "topic_id", "term_id"]]
            .explode("term_id")
            .dropna(subset=["term_id"])
            .reset_index(drop=True)
        )

        # One row per (topic, term), restricted to the leading ranking terms
        df_topics = pd.DataFrame.from_records([topic.model_dump() for topic in topics])
        df_topics["term_id"] = df_topics["term_id"].apply(
            lambda x: x[: self.ranking_terms]
        )
        df_topics = (
            df_topics[["topic_id", "term_id"]]
            .explode("term_id")
            .dropna(subset=["term_id"])
            .reset_index(drop=True)
        )

        # Count, per document, how many of its terms are ranking terms of its topic
        df_rank = pd.merge(df_docs, df_topics, on=["topic_id", "term_id"])
        df_rank = (
            df_rank.groupby(["topic_id", "doc_id"])["term_id"]
            .count()
            .rename("count_topic_terms")
            .reset_index()
        )

        # Sort and rank documents within each topic; method="first" breaks
        # ties by row order so every rank inside a topic is distinct.
        df_rank = df_rank.sort_values(
            ["topic_id", "count_topic_terms"], ascending=(True, False)
        ).reset_index(drop=True)
        df_rank["rank"] = df_rank.groupby("topic_id")["count_topic_terms"].rank(
            method="first", ascending=False
        )

        df_rank = df_rank[df_rank["rank"] <= self.max_doc_per_topic]

        # Build the TopicRanking of every kept document
        ranking_by_doc = {
            doc_id: TopicRanking(topic_id=topic_id, rank=rank)
            for doc_id, topic_id, rank in zip(
                df_rank["doc_id"], df_rank["topic_id"], df_rank["rank"]
            )
        }

        # Documents that were not kept get None
        for doc in docs:
            doc.topic_ranking = ranking_by_doc.get(doc.doc_id)

        # Collect the content of the kept documents per topic, best first
        df_topics_rank = pd.merge(df_rank, df_content, on="doc_id")
        df_topics_rank = df_topics_rank[["topic_id", "content"]]
        df_topics_rank = df_topics_rank.groupby("topic_id")["content"].apply(list)
        dict_topic_rank = df_topics_rank.to_dict()

        # Topics without any kept document get None
        for topic in topics:
            topic.top_doc_content = dict_topic_rank.get(topic.topic_id)

        return docs, topics

__init__(ranking_terms=20, max_doc_per_topic=20)

Initialize the class with ranking_terms and max_doc_per_topic parameters.

Parameters:

Name Type Description Default
ranking_terms int

Number of ranking terms to be used.

20
max_doc_per_topic int

Maximum number of documents per topic.

20

Returns:

Type Description
None

None

Source code in bunkatopics/topic_modeling/document_topic_ranker.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
def __init__(self, ranking_terms: int = 20, max_doc_per_topic: int = 20) -> None:
    """
    Store the two ranking settings on the instance.

    Args:
        ranking_terms (int): Number of ranking terms to be used.
        max_doc_per_topic (int): Maximum number of documents per topic.

    Returns:
        None
    """
    self.ranking_terms, self.max_doc_per_topic = ranking_terms, max_doc_per_topic

fit_transform(docs, topics)

Calculate top documents for each topic based on ranking terms.

Parameters:

Name Type Description Default
docs List[Document]

List of documents.

required
topics List[Topic]

List of topics.

required

Returns:

Type Description
Tuple[List[Document], List[Topic]]

The same document and topic lists that were passed in, updated with their ranking information.

Source code in bunkatopics/topic_modeling/document_topic_ranker.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def fit_transform(
    self,
    docs: t.List[Document],
    topics: t.List[Topic],
) -> t.Tuple[t.List[Document], t.List[Topic]]:
    """
    Calculate top documents for each topic based on ranking terms.

    A document's score is the number of its terms that appear among the first
    ``self.ranking_terms`` terms of its topic. Per topic, the
    ``self.max_doc_per_topic`` best-scoring documents are kept. Each document
    receives a ``topic_ranking`` (``None`` when it is not kept) and each topic
    receives ``top_doc_content``, the contents of its kept documents ordered
    from best to worst (``None`` when no document qualifies). Both inputs are
    modified in place and returned.

    Args:
        docs (List[Document]): List of documents.
        topics (List[Topic]): List of topics.

    Returns:
        Tuple[List[Document], List[Topic]]: Updated lists of documents and topics.
    """
    # With no documents or no topics the frames below would have no columns
    # and the column selection would raise KeyError. Nothing can be ranked,
    # so apply the same "unranked" values the main path uses.
    if not docs or not topics:
        for doc in docs:
            doc.topic_ranking = None
        for topic in topics:
            topic.top_doc_content = None
        return docs, topics

    # Dump every document once and reuse the frame for both the term table
    # and the content lookup further down.
    df_all_docs = pd.DataFrame.from_records([doc.model_dump() for doc in docs])
    df_content = df_all_docs[["doc_id", "content"]]

    # One row per (document, term). An empty term list explodes to NaN; drop
    # those rows so they cannot pair up with NaN rows on the topic side
    # during the merge.
    df_docs = (
        df_all_docs[["doc_id", "topic_id", "term_id"]]
        .explode("term_id")
        .dropna(subset=["term_id"])
        .reset_index(drop=True)
    )

    # One row per (topic, term), restricted to the leading ranking terms
    df_topics = pd.DataFrame.from_records([topic.model_dump() for topic in topics])
    df_topics["term_id"] = df_topics["term_id"].apply(
        lambda x: x[: self.ranking_terms]
    )
    df_topics = (
        df_topics[["topic_id", "term_id"]]
        .explode("term_id")
        .dropna(subset=["term_id"])
        .reset_index(drop=True)
    )

    # Count, per document, how many of its terms are ranking terms of its topic
    df_rank = pd.merge(df_docs, df_topics, on=["topic_id", "term_id"])
    df_rank = (
        df_rank.groupby(["topic_id", "doc_id"])["term_id"]
        .count()
        .rename("count_topic_terms")
        .reset_index()
    )

    # Sort and rank documents within each topic; method="first" breaks ties
    # by row order so every rank inside a topic is distinct.
    df_rank = df_rank.sort_values(
        ["topic_id", "count_topic_terms"], ascending=(True, False)
    ).reset_index(drop=True)
    df_rank["rank"] = df_rank.groupby("topic_id")["count_topic_terms"].rank(
        method="first", ascending=False
    )

    df_rank = df_rank[df_rank["rank"] <= self.max_doc_per_topic]

    # Build the TopicRanking of every kept document
    ranking_by_doc = {
        doc_id: TopicRanking(topic_id=topic_id, rank=rank)
        for doc_id, topic_id, rank in zip(
            df_rank["doc_id"], df_rank["topic_id"], df_rank["rank"]
        )
    }

    # Documents that were not kept get None
    for doc in docs:
        doc.topic_ranking = ranking_by_doc.get(doc.doc_id)

    # Collect the content of the kept documents per topic, best first
    df_topics_rank = pd.merge(df_rank, df_content, on="doc_id")
    df_topics_rank = df_topics_rank[["topic_id", "content"]]
    df_topics_rank = df_topics_rank.groupby("topic_id")["content"].apply(list)
    dict_topic_rank = df_topics_rank.to_dict()

    # Topics without any kept document get None
    for topic in topics:
        topic.top_doc_content = dict_topic_rank.get(topic.topic_id)

    return docs, topics