
Topic Extractor

Extracts terms from text using the Textacy and SpaCy libraries.

This class provides functionality for extracting terms from a given list of documents, based on linguistic features such as n-grams, named entities, and noun chunks.

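A minimal usage sketch (the import path follows the source location shown below; the document IDs and sentences are illustrative placeholders):

from bunkatopics.topic_modeling.term_extractor import TextacyTermsExtractor

# Illustrative toy corpus: one text per document, with matching IDs.
doc_ids = ["doc_1", "doc_2"]
texts = [
    "Machine learning models extract topics from large text corpora.",
    "Researchers at Stanford University publish work on language models.",
]

# Extract unigrams, bigrams and trigrams (the defaults) plus named entities.
extractor = TextacyTermsExtractor(ngs=True, ents=True, language="en")
terms, indexed_terms = extractor.fit_transform(ids=doc_ids, sentences=texts)

print(terms[:5])               # Term objects, most frequent first
print(indexed_terms["doc_1"])  # term strings indexed in the first document
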
Source code in bunkatopics/topic_modeling/term_extractor.py
class TextacyTermsExtractor:
    """
    Extracts terms from text using the Textacy and SpaCy libraries.

    This class provides functionality for extracting terms from a given list of documents,
    based on linguistic features such as n-grams, named entities, and noun chunks.

    """

    def __init__(
        self,
        ngrams: t.List[int] = [1, 2, 3],
        ngs: bool = True,
        ents: bool = False,
        ncs: bool = False,
        drop_emoji: bool = True,
        include_pos: t.List[str] = ["NOUN"],
        include_types: t.List[str] = ["PERSON", "ORG"],
        language: str = "en",
    ):
        """
        Initializes the TextacyTermsExtractor with specified configuration.

        Args:
            ngrams (list[int]): List of n-gram lengths to consider. Defaults to [1, 2, 3].
            ngs (bool): Include n-grams in extraction. Defaults to True.
            ents (bool): Include named entities in extraction. Defaults to False.
            ncs (bool): Include noun chunks in extraction. Defaults to False.
            drop_emoji (bool): Remove emojis before extraction. Defaults to True.
            include_pos (list[str]): POS tags to include. Defaults to ["NOUN"].
            include_types (list[str]): Entity types to include. Defaults to ["PERSON", "ORG"].
            language (str): Language code used to select the SpaCy model. Defaults to "en".

        Raises:
            ValueError: If the specified language is not supported.
        """

        self.ngs = ngs
        self.ents = ents
        self.ncs = ncs
        self.drop_emoji = drop_emoji
        self.include_pos = include_pos
        self.include_types = include_types
        self.ngrams = ngrams
        self.language = language

    def fit_transform(
        self,
        ids: t.List[DOC_ID],
        sentences: t.List[str],
    ) -> t.Tuple[t.List[Term], t.Dict[DOC_ID, t.List[TERM_ID]]]:
        """
        Extracts terms from the provided documents and returns them along with their indices.

        Args:
            ids (List[DOC_ID]): List of document IDs.
            sentences (List[str]): List of sentences corresponding to the document IDs.

        Returns:
            Tuple[List[Term], Dict[DOC_ID, List[TERM_ID]]]: The list of extracted terms and
            a dictionary mapping each document ID to the list of term IDs found in that document.

        Notes:
            - The method processes each document to extract relevant terms based on the configured
            linguistic features such as n-grams, named entities, and noun chunks.
            - It also handles pre-processing steps like normalizing text, removing brackets,
            replacing currency symbols, removing HTML tags, and optionally dropping emojis.
        """

        self.language_model = detect_language_to_spacy_model.get(self.language)

        if self.language_model is None:
            logger.info(
                "No SpaCy model found for this language, defaulting to en_core_web_sm (English)"
            )
            self.language_model = "en_core_web_sm"

        try:
            spacy.load(self.language_model)
        except OSError:
            # The model is not installed, so download it
            spacy.cli.download(self.language_model)

        # Create a DataFrame from the provided document IDs and sentences
        self.df = pd.DataFrame({"content": sentences, "doc_id": ids})

        # Extract terms from the DataFrame
        df_terms, df_terms_indexed = self.extract_terms_df(
            self.df,
            text_var="content",
            index_var="doc_id",
            ngs=self.ngs,
            ents=self.ents,
            ncs=self.ncs,
            drop_emoji=self.drop_emoji,
            ngrams=self.ngrams,
            remove_punctuation=True,
            include_pos=self.include_pos,
            include_types=self.include_types,
            language_model=self.language_model,
        )

        # Process and return the extracted terms
        df_terms = df_terms.reset_index().rename(columns={"terms_indexed": "term_id"})
        terms = [Term(**row) for row in df_terms.to_dict(orient="records")]
        self.terms: t.List[Term] = terms

        df_terms_indexed = df_terms_indexed.reset_index().rename(
            columns={"text": "terms_indexed"}
        )
        indexed_terms_dict = df_terms_indexed.set_index("doc_id")[
            "terms_indexed"
        ].to_dict()

        return terms, indexed_terms_dict

    def extract_terms_df(
        self,
        data: pd.DataFrame,
        text_var: str,
        index_var: str,
        ngs: bool = True,
        ents: bool = True,
        ncs: bool = False,
        drop_emoji: bool = True,
        ngrams: t.Tuple[int, int] = (2, 2),
        remove_punctuation: bool = False,
        include_pos: t.List[str] = ["NOUN", "PROPN", "ADJ"],
        include_types: t.List[str] = ["PERSON", "ORG"],
        language_model: str = "en_core_web_sm",
    ) -> t.Tuple[pd.DataFrame, pd.DataFrame]:
        """Runs term extraction over each row of `data` and returns (terms, terms_indexed) DataFrames."""
        load_lang = textacy.load_spacy_lang(language_model, disable=())

        def extract_terms(
            tuple: t.Tuple[int, str],
            ngs: bool,
            ents: bool,
            ncs: bool,
            ngrams: t.Tuple[int, int],
            drop_emoji: bool,
            remove_punctuation: bool,
            include_pos: t.List[str],
            include_types: t.List[str],
        ) -> pd.DataFrame:
            index = tuple[0]
            text = tuple[1]

            prepro_text = preproc(str(text))
            if drop_emoji:
                prepro_text = textacy.preprocessing.replace.emojis(prepro_text, repl="")

            if remove_punctuation:
                prepro_text = textacy.preprocessing.remove.punctuation(prepro_text)

            doc = textacy.make_spacy_doc(prepro_text, lang=load_lang)

            terms = []

            if ngs:
                ngrams_terms = list(
                    textacy.extract.terms(
                        doc,
                        ngs=partial(
                            textacy.extract.ngrams,
                            n=ngrams,
                            filter_punct=True,
                            filter_stops=True,
                            include_pos=include_pos,
                        ),
                        dedupe=False,
                    )
                )

                terms.append(ngrams_terms)

            if ents:
                ents_terms = list(
                    textacy.extract.terms(
                        doc,
                        ents=partial(
                            textacy.extract.entities, include_types=include_types
                        ),
                        dedupe=False,
                    )
                )
                terms.append(ents_terms)

            if ncs:
                ncs_terms = list(
                    textacy.extract.terms(
                        doc,
                        ncs=partial(textacy.extract.noun_chunks, drop_determiners=True),
                        dedupe=False,
                    )
                )

                noun_chunks = [x for x in ncs_terms if len(x) >= 3]
                terms.append(noun_chunks)

            final = [item for sublist in terms for item in sublist]
            final = list(set(final))

            df = [
                (term.text, term.lemma_.lower(), term.label_, term.__len__())
                for term in final
            ]
            df = pd.DataFrame(df, columns=["text", "lemma", "ent", "ngrams"])
            df["text_index"] = index

            return df

        data = data[data[text_var].notna()]

        sentences = data[text_var].tolist()
        indexes = data[index_var].tolist()
        inputs = [(x, y) for x, y in zip(indexes, sentences)]

        res = list(
            tqdm(
                map(
                    partial(
                        extract_terms,
                        ngs=ngs,
                        ents=ents,
                        ncs=ncs,
                        drop_emoji=drop_emoji,
                        remove_punctuation=remove_punctuation,
                        ngrams=ngrams,
                        include_pos=include_pos,
                        include_types=include_types,
                    ),
                    inputs,
                ),
                total=len(inputs),
            )
        )

        final_res = pd.concat([x for x in res])

        terms = (
            final_res.groupby(["text", "lemma", "ent", "ngrams"])
            .agg(count_terms=("text_index", "count"))
            .reset_index()
        )

        terms = terms.sort_values(["text", "ent"]).reset_index(drop=True)
        terms = terms.drop_duplicates(["text"], keep="first")
        terms = terms.sort_values("count_terms", ascending=False)
        terms = terms.rename(columns={"text": "terms_indexed"})
        terms = terms.set_index("terms_indexed")

        terms_indexed = final_res[["text", "text_index"]].drop_duplicates()
        terms_indexed = terms_indexed.rename(columns={"text_index": index_var})
        terms_indexed = terms_indexed.groupby(index_var)["text"].apply(list)
        terms_indexed = terms_indexed.reset_index()
        terms_indexed = terms_indexed.rename(columns={"text": "terms_indexed"})
        terms_indexed = terms_indexed.set_index(index_var)

        return terms, terms_indexed

__init__(ngrams=[1, 2, 3], ngs=True, ents=False, ncs=False, drop_emoji=True, include_pos=['NOUN'], include_types=['PERSON', 'ORG'], language='en')

Initializes the TextacyTermsExtractor with specified configuration.

Parameters:

  • ngrams (list[int]): List of n-gram lengths to consider. Defaults to [1, 2, 3].
  • ngs (bool): Include n-grams in extraction. Defaults to True.
  • ents (bool): Include named entities in extraction. Defaults to False.
  • ncs (bool): Include noun chunks in extraction. Defaults to False.
  • drop_emoji (bool): Remove emojis before extraction. Defaults to True.
  • include_pos (list[str]): POS tags to include. Defaults to ["NOUN"].
  • include_types (list[str]): Entity types to include. Defaults to ["PERSON", "ORG"].
  • language (str): Language code used to select the SpaCy model. Defaults to "en".

Raises:

  • ValueError: If the specified language is not supported.

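As a sketch of a non-default configuration (the POS tags and entity types below are illustrative choices, not recommendations):

# Keep only bigrams and trigrams, also extract noun chunks, and widen the
# POS and entity filters; emojis are still dropped by default.
extractor = TextacyTermsExtractor(
    ngrams=[2, 3],
    ngs=True,
    ents=True,
    ncs=True,
    include_pos=["NOUN", "PROPN", "ADJ"],
    include_types=["PERSON", "ORG", "GPE"],
    language="en",
)
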
Source code in bunkatopics/topic_modeling/term_extractor.py
def __init__(
    self,
    ngrams: t.List[int] = [1, 2, 3],
    ngs: bool = True,
    ents: bool = False,
    ncs: bool = False,
    drop_emoji: bool = True,
    include_pos: t.List[str] = ["NOUN"],
    include_types: t.List[str] = ["PERSON", "ORG"],
    language: str = "en",
):
    """
    Initializes the TextacyTermsExtractor with specified configuration.

    Args:
        ngrams (list[int]): List of n-gram lengths to consider. Defaults to [1, 2, 3].
        ngs (bool): Include n-grams in extraction. Defaults to True.
        ents (bool): Include named entities in extraction. Defaults to False.
        ncs (bool): Include noun chunks in extraction. Defaults to False.
        drop_emoji (bool): Remove emojis before extraction. Defaults to True.
        include_pos (list[str]): POS tags to include. Defaults to ["NOUN"].
        include_types (list[str]): Entity types to include. Defaults to ["PERSON", "ORG"].
        language (str): Language code used to select the SpaCy model. Defaults to "en".

    Raises:
        ValueError: If the specified language is not supported.
    """

    self.ngs = ngs
    self.ents = ents
    self.ncs = ncs
    self.drop_emoji = drop_emoji
    self.include_pos = include_pos
    self.include_types = include_types
    self.ngrams = ngrams
    self.language = language

fit_transform(ids, sentences)

Extracts terms from the provided documents and returns them along with their indices.

Parameters:

  • ids (List[DOC_ID]): List of document IDs. Required.
  • sentences (List[str]): List of sentences corresponding to the document IDs. Required.

Returns:

  • Tuple[List[Term], Dict[DOC_ID, List[TERM_ID]]]: The list of extracted terms and a dictionary mapping each document ID to the list of term IDs found in that document.
Notes
  • The method processes each document to extract relevant terms based on the configured linguistic features such as n-grams, named entities, and noun chunks.
  • It also handles pre-processing steps like normalizing text, removing brackets, replacing currency symbols, removing HTML tags, and optionally dropping emojis.
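A short call sketch, assuming an extractor constructed as above; the IDs, texts, and printed output are illustrative:

ids = ["a", "b", "c"]
texts = [
    "Solar panels lower household energy bills.",
    "Wind turbines and solar farms expand renewable capacity.",
    "Battery storage smooths the output of renewable sources.",
]

terms, indexed = extractor.fit_transform(ids=ids, sentences=texts)

# `terms` is a list of Term objects sorted by corpus frequency;
# `indexed` maps each doc_id to the term strings found in that document,
# e.g. indexed["b"] might contain "wind turbines" and "solar farms".
print(indexed["b"])
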
Source code in bunkatopics/topic_modeling/term_extractor.py
def fit_transform(
    self,
    ids: t.List[DOC_ID],
    sentences: t.List[str],
) -> t.Tuple[t.List[Term], t.Dict[DOC_ID, t.List[TERM_ID]]]:
    """
    Extracts terms from the provided documents and returns them along with their indices.

    Args:
        ids (List[DOC_ID]): List of document IDs.
        sentences (List[str]): List of sentences corresponding to the document IDs.

    Returns:
        Tuple[List[Term], Dict[DOC_ID, List[TERM_ID]]]: The list of extracted terms and
        a dictionary mapping each document ID to the list of term IDs found in that document.

    Notes:
        - The method processes each document to extract relevant terms based on the configured
        linguistic features such as n-grams, named entities, and noun chunks.
        - It also handles pre-processing steps like normalizing text, removing brackets,
        replacing currency symbols, removing HTML tags, and optionally dropping emojis.
    """

    self.language_model = detect_language_to_spacy_model.get(self.language)

    if self.language_model is None:
        logger.info(
            "No SpaCy model found for this language, defaulting to en_core_web_sm (English)"
        )
        self.language_model = "en_core_web_sm"

    try:
        spacy.load(self.language_model)
    except OSError:
        # The model is not installed, so download it
        spacy.cli.download(self.language_model)

    # Create a DataFrame from the provided document IDs and sentences
    self.df = pd.DataFrame({"content": sentences, "doc_id": ids})

    # Extract terms from the DataFrame
    df_terms, df_terms_indexed = self.extract_terms_df(
        self.df,
        text_var="content",
        index_var="doc_id",
        ngs=self.ngs,
        ents=self.ents,
        ncs=self.ncs,
        drop_emoji=self.drop_emoji,
        ngrams=self.ngrams,
        remove_punctuation=True,
        include_pos=self.include_pos,
        include_types=self.include_types,
        language_model=self.language_model,
    )

    # Process and return the extracted terms
    df_terms = df_terms.reset_index().rename(columns={"terms_indexed": "term_id"})
    terms = [Term(**row) for row in df_terms.to_dict(orient="records")]
    self.terms: t.List[Term] = terms

    df_terms_indexed = df_terms_indexed.reset_index().rename(
        columns={"text": "terms_indexed"}
    )
    indexed_terms_dict = df_terms_indexed.set_index("doc_id")[
        "terms_indexed"
    ].to_dict()

    return terms, indexed_terms_dict