Skip to content

Topic LLM Cleaner

A class for cleaning topic labels using a generative model.

This class utilizes a language model to generate cleaned and more coherent labels for a given list of topics. The cleaning process considers the top documents and terms associated with each topic and optionally includes the actual content of the top documents for a more context-rich label generation.

Source code in bunkatopics/topic_modeling/llm_topic_representation.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
class LLMCleaningTopic:
    """
    A class for cleaning topic labels using a generative model.

    This class utilizes a language model to generate cleaned and more coherent labels for a given list of topics.
    The cleaning process considers the top documents and terms associated with each topic and optionally includes
    the actual content of the top documents for a more context-rich label generation.

    """

    def __init__(
        self,
        llm: LLM,
        language: str = "english",
        top_doc: int = 3,
        top_terms: int = 10,
        use_doc: bool = False,
        context: str = "everything",
    ) -> None:
        """
        Initialize the LLMCleaningTopic instance.

        The values are only stored here; they are forwarded to the label
        generation helper each time `fit_transform` is called.

        Args:
            llm: The generative model to use for label cleaning.
            language (str): Language used for generating labels. Defaults to "english".
            top_doc (int): Number of top documents to consider for each topic. Defaults to 3.
            top_terms (int): Number of top terms to consider for each topic. Defaults to 10.
            use_doc (bool): Whether to include document contents in label generation. Defaults to False.
            context (str): Context for label generation. Defaults to "everything".
        """
        self.llm = llm
        self.language = language
        self.top_doc = top_doc
        self.top_terms = top_terms
        self.use_doc = use_doc
        self.context = context

    def fit_transform(
        self, topics: t.List[Topic], docs: t.List[Document]
    ) -> t.List[Topic]:
        """
        Clean topic labels for a list of topics using the generative model.

        This method processes each topic by generating a new, cleaned label based on the top terms and documents
        associated with the topic. The cleaned labels are then assigned back to the topics.

        The topics are modified in place: `name` is overwritten for every topic that received a
        generated label. Topics without a row in the prompt table keep their current name.

        Args:
            topics (List[Topic]): List of topics to clean.
            docs (List[Document]): List of documents related to the topics.

        Returns:
            List[Topic]: The same `topics` list that was passed in, with updated names.
        """
        prompt_df = _get_df_prompt(topics, docs)

        topic_ids = list(prompt_df["topic_id"])
        specific_terms = list(prompt_df["keywords"])
        top_doc_contents = list(prompt_df["content"])

        cleaned_names = {}
        # The context manager closes the progress bar even when a model call raises.
        with tqdm(
            total=len(topic_ids), desc="Creating new labels for clusters"
        ) as pbar:
            for topic_id, terms, documents in zip(
                topic_ids, specific_terms, top_doc_contents
            ):
                cleaned_names[topic_id] = _get_clean_topic(
                    llm=self.llm,
                    language=self.language,
                    specific_terms=terms,
                    specific_documents=documents,
                    use_doc=self.use_doc,
                    top_terms=self.top_terms,
                    top_doc=self.top_doc,
                    context=self.context,
                )
                pbar.update(1)

        for topic in topics:
            # Only overwrite when a label was generated for this topic, so a topic
            # missing from the prompt table does not end up with `name = None`.
            if topic.topic_id in cleaned_names:
                topic.name = cleaned_names[topic.topic_id]

        return topics

__init__(llm, language='english', top_doc=3, top_terms=10, use_doc=False, context='everything')

Initialize the LLMCleaningTopic instance.

Parameters:

Name Type Description Default
llm LLM

The generative model to use for label cleaning.

required
language str

Language used for generating labels. Defaults to "english".

'english'
top_doc int

Number of top documents to consider for each topic. Defaults to 3.

3
top_terms int

Number of top terms to consider for each topic. Defaults to 10.

10
use_doc bool

Whether to include document contents in label generation. Defaults to False.

False
context str

Context for label generation. Defaults to "everything".

'everything'
Source code in bunkatopics/topic_modeling/llm_topic_representation.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def __init__(
    self,
    llm: LLM,
    language: str = "english",
    top_doc: int = 3,
    top_terms: int = 10,
    use_doc: bool = False,
    context: str = "everything",
) -> None:
    """
    Store the generative model and the label-generation settings on the instance.

    No validation or computation happens here; every argument is kept as-is
    under an attribute of the same name.

    Args:
        llm: The generative model to use for label cleaning.
        language (str): Language used for generating labels. Defaults to "english".
        top_doc (int): Number of top documents to consider for each topic. Defaults to 3.
        top_terms (int): Number of top terms to consider for each topic. Defaults to 10.
        use_doc (bool): Whether to include document contents in label generation. Defaults to False.
        context (str): Context for label generation. Defaults to "everything".
    """
    # Model and prompt-level settings.
    self.llm, self.language, self.context = llm, language, context
    # Per-topic material limits and the document-content switch.
    self.top_terms, self.top_doc, self.use_doc = top_terms, top_doc, use_doc

fit_transform(topics, docs)

Clean topic labels for a list of topics using the generative model.

This method processes each topic by generating a new, cleaned label based on the top terms and documents associated with the topic. The cleaned labels are then assigned back to the topics.

Parameters:

Name Type Description Default
topics List[Topic]

List of topics to clean.

required
docs List[Document]

List of documents related to the topics.

required
Source code in bunkatopics/topic_modeling/llm_topic_representation.py
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def fit_transform(
    self, topics: t.List[Topic], docs: t.List[Document]
) -> t.List[Topic]:
    """
    Clean topic labels for a list of topics using the generative model.

    This method processes each topic by generating a new, cleaned label based on the top terms and documents
    associated with the topic. The cleaned labels are then assigned back to the topics.

    The topics are modified in place: `name` is overwritten for every topic that received a
    generated label. Topics without a row in the prompt table keep their current name.

    Args:
        topics (List[Topic]): List of topics to clean.
        docs (List[Document]): List of documents related to the topics.

    Returns:
        List[Topic]: The same `topics` list that was passed in, with updated names.
    """
    prompt_df = _get_df_prompt(topics, docs)

    topic_ids = list(prompt_df["topic_id"])
    specific_terms = list(prompt_df["keywords"])
    top_doc_contents = list(prompt_df["content"])

    cleaned_names = {}
    # The context manager closes the progress bar even when a model call raises.
    with tqdm(
        total=len(topic_ids), desc="Creating new labels for clusters"
    ) as pbar:
        for topic_id, terms, documents in zip(
            topic_ids, specific_terms, top_doc_contents
        ):
            cleaned_names[topic_id] = _get_clean_topic(
                llm=self.llm,
                language=self.language,
                specific_terms=terms,
                specific_documents=documents,
                use_doc=self.use_doc,
                top_terms=self.top_terms,
                top_doc=self.top_doc,
                context=self.context,
            )
            pbar.update(1)

    for topic in topics:
        # Only overwrite when a label was generated for this topic, so a topic
        # missing from the prompt table does not end up with `name = None`.
        if topic.topic_id in cleaned_names:
            topic.name = cleaned_names[topic.topic_id]

    return topics