Skip to content

Better Profanity

Profanity

Detect censored words and hide them with a special character.

Source code in better_profanity/better_profanity.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
class Profanity:
    """Detect censored words in a text and hide them with a special character.

    The instance keeps its word list in ``CENSOR_WORDSET`` as a list of
    ``VaryingString`` objects built with ``CHARS_MAPPING``, so membership tests
    such as ``word in self.CENSOR_WORDSET`` also match the character
    substitutions listed in that mapping.
    """

    def __init__(self, words=None, special_cases=SPECIAL_CASES):
        """
        Initialise the word list and the special cases.
        Parameters
        ----------
        words : list[str] or str
            Collection of words or file path for a list of words to censor. `None` to use the default word list.
        special_cases : list[str]
            Special cases to censor

        Raises
        ------
        TypeError
            If `words` is neither `None`, a string nor an iterable.
        """
        if words is not None and not isinstance(words, (str, Iterable)):
            raise TypeError("words must be of type str, list, or None")
        self.CENSOR_WORDSET = []
        self.CHARS_MAPPING = {
            "a": ("a", "@", "*", "4"),
            "i": ("i", "*", "l", "1"),
            "o": ("o", "*", "0", "@"),
            "u": ("u", "*", "v"),
            "v": ("v", "*", "u"),
            "l": ("l", "1"),
            "e": ("e", "*", "3"),
            "s": ("s", "$", "5"),
            "t": ("t", "7"),
            "'": ("'", "’"),
        }
        # Number of following words inspected when looking for multi-word
        # swear phrases; raised while words are loaded.
        self.MAX_NUMBER_COMBINATIONS = 1
        self.ALLOWED_CHARACTERS = ALLOWED_CHARACTERS
        self._default_wordlist_filename = get_complete_path_of_file(
            "profanity_wordlist.txt"
        )
        self.SPECIAL_CASES = special_cases
        # isinstance (rather than an exact type comparison) so that str
        # subclasses are also treated as a file path instead of being
        # iterated character by character.
        if isinstance(words, str):
            self.load_censor_words_from_file(words)
        else:
            self.load_censor_words(custom_words=words)

    ## PUBLIC ##

    def censor(self, text, censor_char="*"):
        """
        Replace the swear words in the text using `censor_char`.
        Parameters
        ----------
        text : str
            Text to censor. Non-string values are converted with `str()`.
        censor_char : str
            Character handed to `get_replacement_for_swear_word` to build
            the replacement. Non-string values are converted with `str()`.

        Returns
        -------
        censored_text : str
            Censored text
        """
        if not isinstance(text, str):
            text = str(text)
        if not isinstance(censor_char, str):
            censor_char = str(censor_char)

        # Make sure a word list is loaded before scanning.
        if not self.CENSOR_WORDSET:
            self.load_censor_words()
        return self._hide_swear_words(text, censor_char)

    def load_censor_words_from_file(self, filename, **kwargs):
        """
        Load censor words from file
        Parameters
        ----------
        filename : str
            The file name of censor words
        **kwargs
            Forwarded to `_populate_words_to_wordset` (e.g. `whitelist_words`).
        """
        words = read_wordlist(filename)
        self._populate_words_to_wordset(words, **kwargs)

    def load_censor_words(self, custom_words=None, **kwargs):
        """
        Generate the collection of words that need to be censored.
        Parameters
        ----------
        custom_words : list[str]
            Words that should be censored. When falsy (`None` or empty), the
            default word list file is read instead.
        **kwargs
            Forwarded to `_populate_words_to_wordset` (e.g. `whitelist_words`).
        """
        # Replace the words from `profanity_wordlist.txt` with a custom list
        custom_words = custom_words or read_wordlist(self._default_wordlist_filename)
        self._populate_words_to_wordset(custom_words, **kwargs)

    def add_censor_words(self, custom_words):
        """
        Add custom censor words to the existing word list.
        Parameters
        ----------
        custom_words : list[str]
            Custom words to censor

        Raises
        ------
        TypeError
            If `custom_words` is not a list, tuple or set.
        """
        if not isinstance(custom_words, (list, tuple, set)):
            raise TypeError(
                "Function 'add_censor_words' only accepts list, tuple or set."
            )
        for word in custom_words:
            # All words in CENSOR_WORDSET must be in lowercase, because the
            # scanned text is lowercased before it is compared.
            word = word.lower()

            # Keep the look-ahead window wide enough for the new word, exactly
            # as `_populate_words_to_wordset` does for loaded words.
            num_of_non_allowed_chars = self._count_non_allowed_characters(word)
            if num_of_non_allowed_chars > self.MAX_NUMBER_COMBINATIONS:
                self.MAX_NUMBER_COMBINATIONS = num_of_non_allowed_chars

            self.CENSOR_WORDSET.append(
                VaryingString(word, char_map=self.CHARS_MAPPING)
            )

    def contains_profanity(self, text):
        """
        Return True if the input text has any swear words.
        Parameters
        ----------
        text : str
            Text to check. Non-string values are converted with `str()`.
        """
        # `censor` always returns a str; convert first so that a non-string
        # input is not reported as profane merely because its type differs.
        if not isinstance(text, str):
            text = str(text)
        return text != self.censor(text)

    ## PRIVATE ##

    def _populate_words_to_wordset(self, words, *, whitelist_words=None):
        """
        Replace `CENSOR_WORDSET` with the given words, minus the whitelist.
        Parameters
        ----------
        words : iterable of str
            Words to censor; they are lowercased and de-duplicated.
        whitelist_words : list, tuple or set of str, optional
            Words to leave out of the word list (compared in lowercase).

        Raises
        ------
        TypeError
            If `whitelist_words` is not a list, tuple, set or `None`.
        ValueError
            If an entry of `whitelist_words` is not a string.
        """
        if whitelist_words is not None and not isinstance(
                whitelist_words, (list, set, tuple)
        ):
            raise TypeError(
                "The 'whitelist_words' keyword argument only accepts list, tuple or set."
            )

        # Validation. Build a fresh set instead of assigning by index: index
        # assignment fails on tuples and sets and would modify a caller's list.
        lowered_whitelist = set()
        for word in whitelist_words or ():
            if not isinstance(word, str):
                raise ValueError(
                    "Each word in 'whitelist_words' must be 'str' type, "
                    "but '{word}' found.".format(word=type(word))
                )
            lowered_whitelist.add(word.lower())

        # Populate the words into an internal wordset.
        # All words in CENSOR_WORDSET must be in lowercase; lowercasing before
        # de-duplicating avoids storing the same word twice.
        all_censor_words = []
        for word in {raw_word.lower() for raw_word in words}:
            if word in lowered_whitelist:
                continue

            num_of_non_allowed_chars = self._count_non_allowed_characters(word)
            if num_of_non_allowed_chars > self.MAX_NUMBER_COMBINATIONS:
                self.MAX_NUMBER_COMBINATIONS = num_of_non_allowed_chars

            all_censor_words.append(VaryingString(word, char_map=self.CHARS_MAPPING))

        # The default wordlist takes ~5MB+ of memory
        self.CENSOR_WORDSET = all_censor_words

    def _count_non_allowed_characters(self, word):
        """Return how many characters of `word` are not in `ALLOWED_CHARACTERS`."""
        return sum(1 for char in word if char not in self.ALLOWED_CHARACTERS)

    def _update_next_words_indices(self, text, words_indices, start_idx):
        """
        Return the list of next words (with end indices) after the input index.
        Parameters
        ----------
        text : str
            text to censor
        words_indices : list[tuple[str, int]]
            Entries previously produced by `_get_next_words`; may be empty.
            A non-empty list is modified in place.
        start_idx : int
            The starting index, used only when `words_indices` is empty

        Returns
        -------
        words_indices : list[tuple[str, int]]
            The refreshed entries
        """
        if not words_indices:
            words_indices = self._get_next_words(
                text, start_idx, self.MAX_NUMBER_COMBINATIONS
            )
        else:
            # Drop the first pair of entries (one word), then extend the
            # window by one more word unless the end of the text was reached
            # (marked by an empty word).
            del words_indices[:2]
            if words_indices and words_indices[-1][0] != "":
                words_indices += self._get_next_words(text, words_indices[-1][1], 1)
        return words_indices

    def _hide_swear_words(self, text, censor_char):
        """
        Replace the swear words with censor characters.
        Parameters
        ----------
        text : str
            Text to censor
        censor_char : str
            A character for replacement

        Returns
        -------
        censored_text : str
            Censored text
        """
        censored_text = ""
        cur_word = ""
        skip_index = -1
        next_words_indices = []
        start_idx_of_next_word = self._get_start_index_of_next_word(text, 0)

        # If there are no words in the text, return the raw text without parsing
        if start_idx_of_next_word >= len(text) - 1:
            return text

        # Left strip the text, to avoid inaccurate parsing
        if start_idx_of_next_word > 0:
            censored_text = text[:start_idx_of_next_word]
            text = text[start_idx_of_next_word:]

        # Splitting each word in the text to compare with censored words
        for index, char in enumerate(text):
            # Characters already covered by a censored multi-word phrase
            if index < skip_index:
                continue
            if char in self.ALLOWED_CHARACTERS:
                cur_word += char
                continue

            # Skip continuous non-allowed characters
            if cur_word.strip() == "":
                censored_text += char
                cur_word = ""
                continue

            # Iterate the next words combined with the current one
            # to check if it forms a swear word
            next_words_indices = self._update_next_words_indices(
                text, next_words_indices, index
            )
            contains_swear_word, end_index = any_next_words_form_swear_word(
                cur_word, next_words_indices, self.CENSOR_WORDSET
            )

            if contains_swear_word:
                cur_word = get_replacement_for_swear_word(censor_char)
                skip_index = end_index
                # The separator belongs to the censored phrase; do not emit it
                char = ""
                next_words_indices = []

            # If the current word is a swear word
            if cur_word.lower() in self.CENSOR_WORDSET:
                cur_word = get_replacement_for_swear_word(censor_char)

            censored_text += cur_word + char
            cur_word = ""

        # Final check: flush a trailing word, unless it is the tail of a
        # phrase that was already censored up to the end of the text.
        if cur_word != "" and skip_index < len(text) - 1:
            if cur_word.lower() in self.CENSOR_WORDSET:
                cur_word = get_replacement_for_swear_word(censor_char)
            censored_text += cur_word
        return censored_text

    def check_special_cases(self, text):
        """
        Check special cases.
        Parameters
        ----------
        text : str
            Text to check for special cases

        Returns
        -------
        curse_words : list[str]
            The entries of `SPECIAL_CASES` that occur in the text
            (case-insensitive substring match), in their original spelling
        """
        # Lowercase the text once instead of once per special case.
        lowered_text = text.lower()
        return [case for case in self.SPECIAL_CASES if case.lower() in lowered_text]

    def get_curse_words(self, text):
        """
        Get list of curse words.
        Parameters
        ----------
        text : str
            Text to get curse words from. Non-string values are converted
            with `str()`.

        Returns
        -------
        curse_words : list[str]
            Curse words found while scanning the text, followed by the
            matching special cases
        """
        if not isinstance(text, str):
            text = str(text)
        # Terminate the text so the last word is also handled inside the loop
        # (presumably "." is not in ALLOWED_CHARACTERS -- TODO confirm).
        text += "."
        curse_words = []
        cur_word = ""
        skip_index = -1
        next_words_indices = []
        start_idx_of_next_word = self._get_start_index_of_next_word(text, 0)

        # If there are no words in the text, return the empty list
        if start_idx_of_next_word >= len(text) - 1:
            return []

        # Left strip the text, to avoid inaccurate parsing
        if start_idx_of_next_word > 0:
            text = text[start_idx_of_next_word:]

        # Splitting each word in the text to compare with censored words
        for index, char in enumerate(text):
            # Characters already covered by a detected multi-word phrase
            if index < skip_index:
                continue
            if char in self.ALLOWED_CHARACTERS:
                cur_word += char
                continue

            # Skip continuous non-allowed characters
            if cur_word.strip() == "":
                cur_word = ""
                continue

            # Iterate the next words combined with the current one
            # to check if it forms a swear word
            next_words_indices = self._update_next_words_indices(
                text, next_words_indices, index
            )

            contains_swear_word, end_index = any_next_words_form_swear_word(
                cur_word, next_words_indices, self.CENSOR_WORDSET
            )

            # Reset for every word: a phrase left over from an earlier match
            # must not be reported again for the words that follow it.
            cur_phrase = ""
            if contains_swear_word:
                skip_index = end_index
                next_words_indices = []
                cur_phrase = cur_word + text[index:end_index]

            # If the current phrase or word is a swear word
            if cur_phrase and cur_phrase.lower() in self.CENSOR_WORDSET:
                curse_words.append(cur_phrase)
            elif cur_word.lower() in self.CENSOR_WORDSET:
                curse_words.append(cur_word)

            cur_word = ""

        if cur_word != "" and skip_index < len(text) - 1:
            if cur_word.lower() in self.CENSOR_WORDSET:
                curse_words.append(cur_word)

        curse_words.extend(self.check_special_cases(text=text))
        return curse_words

    def _get_start_index_of_next_word(self, text, start_idx):
        """
        Return the index of the first character of the next word in the given text.
        Parameters
        ----------
        text : str
            Text
        start_idx : int
            Start index

        Returns
        -------
        start_idx_of_next_word : int
            Index of the first character at or after `start_idx` that is in
            `ALLOWED_CHARACTERS`, or `len(text)` when there is none
        """
        start_idx_of_next_word = len(text)
        for index in range(start_idx, len(text)):
            if text[index] not in self.ALLOWED_CHARACTERS:
                continue
            start_idx_of_next_word = index
            break

        return start_idx_of_next_word

    def _get_next_word_and_end_index(self, text, start_idx):
        """
        Return the next word in the given text, and the index where scanning stopped.
        Parameters
        ----------
        text : str
            Text
        start_idx : int
            Start index

        Returns
        -------
        next_word : str
            The run of allowed characters starting at `start_idx`
        index : int
            Index of the first character after the word that is not in
            `ALLOWED_CHARACTERS`; when the word runs to the end of the text
            this is the index of the last character instead, and it is
            `start_idx` when nothing is scanned
        """

        next_word = ""
        index = start_idx
        for index in range(start_idx, len(text)):
            char = text[index]
            if char in self.ALLOWED_CHARACTERS:
                next_word += char
                continue
            break
        return next_word, index

    def _get_next_words(self, text, start_idx, num_of_next_words=1):
        """
        Return a list of pairs of next words and next words included with separators, combined with their end indices.
        For example: Word `hand_job` has next words pairs: `job`, `_job`.
        Parameters
        ----------
        text : str
            Text
        start_idx : int
            Start index
        num_of_next_words : int
            Number of next words

        Returns
        -------
        words : list[tuple[str, int]]
            Two entries per word: the bare word, then the word prefixed with
            the separators between `start_idx` and the word, each with the
            end index. Two empty-word entries mark the end of the text.
        """

        # Find the starting index of the next word
        start_idx_of_next_word = self._get_start_index_of_next_word(text, start_idx)

        # Return a pair of empty words if there are no other words
        if start_idx_of_next_word >= len(text) - 1:
            return [("", start_idx_of_next_word), ("", start_idx_of_next_word)]

        # Combine the words into a list
        next_word, end_index = self._get_next_word_and_end_index(
            text, start_idx_of_next_word
        )

        words = [
            (next_word, end_index),
            ("%s%s" % (text[start_idx:start_idx_of_next_word], next_word), end_index),
        ]
        if num_of_next_words > 1:
            words.extend(self._get_next_words(text, end_index, num_of_next_words - 1))

        return words

__init__(words=None, special_cases=SPECIAL_CASES)

Initialize the word list and the special cases.

Parameters:

Name Type Description Default
words list[str]

Collection of words or file path for a list of words to censor. None to use the default word list.

None
special_cases list[str]

Special cases to censor

SPECIAL_CASES
Source code in better_profanity/better_profanity.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def __init__(self, words=None, special_cases=SPECIAL_CASES):
    """
    Initialise the word list and the special cases.
    Parameters
    ----------
    words : list[str] or str
        Collection of words or file path for a list of words to censor. `None` to use the default word list.
    special_cases : list[str]
        Special cases to censor

    Raises
    ------
    TypeError
        If `words` is neither `None`, a string nor an iterable.
    """
    if words is not None and not isinstance(words, (str, Iterable)):
        raise TypeError("words must be of type str, list, or None")
    self.CENSOR_WORDSET = []
    self.CHARS_MAPPING = {
        "a": ("a", "@", "*", "4"),
        "i": ("i", "*", "l", "1"),
        "o": ("o", "*", "0", "@"),
        "u": ("u", "*", "v"),
        "v": ("v", "*", "u"),
        "l": ("l", "1"),
        "e": ("e", "*", "3"),
        "s": ("s", "$", "5"),
        "t": ("t", "7"),
        "'": ("'", "’"),
    }
    self.MAX_NUMBER_COMBINATIONS = 1
    self.ALLOWED_CHARACTERS = ALLOWED_CHARACTERS
    self._default_wordlist_filename = get_complete_path_of_file(
        "profanity_wordlist.txt"
    )
    self.SPECIAL_CASES = special_cases
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are also treated as a file path instead of being
    # iterated character by character.
    if isinstance(words, str):
        self.load_censor_words_from_file(words)
    else:
        self.load_censor_words(custom_words=words)

add_censor_words(custom_words)

Add custom censor word to existing set.

Parameters:

Name Type Description Default
custom_words list[str]

Custom words to censor

required
Source code in better_profanity/better_profanity.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def add_censor_words(self, custom_words):
    """
    Add custom censor words to the existing word list.
    Parameters
    ----------
    custom_words : list[str]
        Custom words to censor

    Raises
    ------
    TypeError
        If `custom_words` is not a list, tuple or set.
    """
    if not isinstance(custom_words, (list, tuple, set)):
        raise TypeError(
            "Function 'add_censor_words' only accepts list, tuple or set."
        )
    for word in custom_words:
        # All words in CENSOR_WORDSET must be in lowercase, because the
        # scanned text is lowercased before it is compared.
        word = word.lower()

        # Keep the look-ahead window wide enough for the new word, exactly
        # as `_populate_words_to_wordset` does for loaded words.
        num_of_non_allowed_chars = self._count_non_allowed_characters(word)
        if num_of_non_allowed_chars > self.MAX_NUMBER_COMBINATIONS:
            self.MAX_NUMBER_COMBINATIONS = num_of_non_allowed_chars

        self.CENSOR_WORDSET.append(VaryingString(word, char_map=self.CHARS_MAPPING))

censor(text, censor_char='*')

Replace the swear words in the text with censor_char.

Parameters:

Name Type Description Default
text str

Text to censor.

required
censor_char str

Character used to replace censored words

'*'
Source code in better_profanity/better_profanity.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def censor(self, text, censor_char="*"):
    """
    Replace the swear words in the text using `censor_char`.
    Parameters
    ----------
    text : str
        Text to censor. Non-string values are converted with `str()`.
    censor_char : str
        Character used for the replacement. Non-string values are
        converted with `str()`.
    """
    text = text if isinstance(text, str) else str(text)
    censor_char = censor_char if isinstance(censor_char, str) else str(censor_char)

    # Make sure a word list is loaded before scanning.
    wordset_is_empty = not self.CENSOR_WORDSET
    if wordset_is_empty:
        self.load_censor_words()
    return self._hide_swear_words(text, censor_char)

check_special_cases(text)

Check special cases.

Parameters:

Name Type Description Default
text str

Text to check for special cases

required

Returns:

Name Type Description
curse_words list[str]

List of special cases found in the text

Source code in better_profanity/better_profanity.py
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
def check_special_cases(self, text):
    """
    Check special cases.
    Parameters
    ----------
    text : str
        Text to check for special cases

    Returns
    -------
    curse_words : list[str]
        The entries of `SPECIAL_CASES` that occur in the text
        (case-insensitive substring match), in their original spelling
    """
    # Lowercase the text once instead of once per special case.
    lowered_text = text.lower()
    return [case for case in self.SPECIAL_CASES if case.lower() in lowered_text]

contains_profanity(text)

Return True if the input text has any swear words.

Parameters:

Name Type Description Default
text str

Text to censor

required
Source code in better_profanity/better_profanity.py
119
120
121
122
123
124
125
126
127
128
def contains_profanity(self, text):
    """
    Return True if the input text has any swear words.
    Parameters
    ----------
    text : str
        Text to check. Non-string values are converted with `str()`.
    """
    # `censor` always returns a str; convert first so that a non-string
    # input is not reported as profane merely because its type differs.
    if not isinstance(text, str):
        text = str(text)
    return text != self.censor(text)

get_curse_words(text)

Get list of curse words.

Parameters:

Name Type Description Default
text str

Text to get curse words

required

Returns:

Name Type Description
curse_words list[str]

List of curse words found in the text

Source code in better_profanity/better_profanity.py
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
def get_curse_words(self, text):
    """
    Get list of curse words.
    Parameters
    ----------
    text : str
        Text to get curse words from. Non-string values are converted
        with `str()`.

    Returns
    -------
    curse_words : list[str]
        Curse words found while scanning the text, followed by the
        matching special cases
    """
    if not isinstance(text, str):
        text = str(text)
    # Terminate the text so the last word is also handled inside the loop
    # (presumably "." is not in ALLOWED_CHARACTERS -- TODO confirm).
    text += "."
    curse_words = []
    cur_word = ""
    skip_index = -1
    next_words_indices = []
    start_idx_of_next_word = self._get_start_index_of_next_word(text, 0)

    # If there are no words in the text, return the empty list
    if start_idx_of_next_word >= len(text) - 1:
        return []

    # Left strip the text, to avoid inaccurate parsing
    if start_idx_of_next_word > 0:
        text = text[start_idx_of_next_word:]

    # Splitting each word in the text to compare with censored words
    for index, char in enumerate(text):
        # Characters already covered by a detected multi-word phrase
        if index < skip_index:
            continue
        if char in self.ALLOWED_CHARACTERS:
            cur_word += char
            continue

        # Skip continuous non-allowed characters
        if cur_word.strip() == "":
            cur_word = ""
            continue

        # Iterate the next words combined with the current one
        # to check if it forms a swear word
        next_words_indices = self._update_next_words_indices(
            text, next_words_indices, index
        )

        contains_swear_word, end_index = any_next_words_form_swear_word(
            cur_word, next_words_indices, self.CENSOR_WORDSET
        )

        # Reset for every word: a phrase left over from an earlier match
        # must not be reported again for the words that follow it.
        cur_phrase = ""
        if contains_swear_word:
            skip_index = end_index
            next_words_indices = []
            cur_phrase = cur_word + text[index:end_index]

        # If the current phrase or word is a swear word
        if cur_phrase and cur_phrase.lower() in self.CENSOR_WORDSET:
            curse_words.append(cur_phrase)
        elif cur_word.lower() in self.CENSOR_WORDSET:
            curse_words.append(cur_word)

        cur_word = ""

    if cur_word != "" and skip_index < len(text) - 1:
        if cur_word.lower() in self.CENSOR_WORDSET:
            curse_words.append(cur_word)

    curse_words.extend(self.check_special_cases(text=text))
    return curse_words

load_censor_words(custom_words=None, **kwargs)

Generate a set of words that need to be censored.

Parameters:

Name Type Description Default
custom_words list[str]

Set of words that should be censored.

None
Source code in better_profanity/better_profanity.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def load_censor_words(self, custom_words=None, **kwargs):
    """
    Generate the collection of words that need to be censored.
    Parameters
    ----------
    custom_words : list[str]
        Words that should be censored. When falsy (`None` or empty), the
        default word list file is read instead.
    **kwargs
        Forwarded unchanged to `_populate_words_to_wordset`.
    """
    # Fall back to the words from `profanity_wordlist.txt` when no custom list is given
    if not custom_words:
        custom_words = read_wordlist(self._default_wordlist_filename)
    self._populate_words_to_wordset(custom_words, **kwargs)

load_censor_words_from_file(filename, **kwargs)

Load censor words from file

Parameters:

Name Type Description Default
filename str

The file name of censor words

required
Source code in better_profanity/better_profanity.py
80
81
82
83
84
85
86
87
88
89
def load_censor_words_from_file(self, filename, **kwargs):
    """
    Load censor words from file
    Parameters
    ----------
    filename : str
        The file name of censor words
    **kwargs
        Forwarded unchanged to `_populate_words_to_wordset`.
    """
    # Read the file first, then hand the words to the shared loader.
    self._populate_words_to_wordset(read_wordlist(filename), **kwargs)

VaryingString

Represents a string with varying character representations.

Source code in better_profanity/varying_string.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
class VaryingString:
    """Represents a string with varying character representations.

    Each character of the original string may be written as any of the
    alternatives listed for it in ``char_map``. Comparing an instance with a
    plain ``str`` is true when that string can be produced by picking one
    alternative per character.
    """

    def __init__(self, string, char_map=None):
        """
        Parameters
        ----------
        string : str
            The original string.
        char_map : dict[str, tuple[str, ...]], optional
            Maps a character to the alternatives that may stand in for it.
            Characters missing from the map only match themselves.
        """
        # `None` instead of a mutable `{}` default shared between calls.
        if char_map is None:
            char_map = {}

        self._original = string

        # There is not necessarily a single length for all of this string's variants.
        # Some character substitutions may include more than one character or empty
        # substitutions.
        self._min_len = 0
        self._max_len = 0

        # Create list of all possible character combinations.
        self._char_combos = []
        for char in self._original:
            variants = char_map[char] if char in char_map else (char,)
            self._char_combos.append(variants)
            lens = [len(variant) for variant in variants]
            self._min_len += min(lens)
            self._max_len += max(lens)

    def __str__(self):
        return self._original

    def __repr__(self):
        return "{name}({original!r})".format(
            name=type(self).__name__, original=self._original
        )

    def __eq__(self, other):
        """Return True if `other` is this object or a `str` spelling one variant.

        Raises
        ------
        NotImplementedError
            When compared with a different `VaryingString` instance.
        """
        if self is other:
            return True
        if other.__class__ == VaryingString:
            # We have no use case for this yet.
            raise NotImplementedError
        if other.__class__ != str:
            return False

        # Cheap rejection on length before walking the variants.
        if not self._min_len <= len(other) <= self._max_len:
            return False

        # Offsets into `other` reachable after consuming the characters seen so
        # far. Several offsets are needed because substitutions may be empty or
        # longer than one character. A set keeps every offset once, so empty
        # substitutions cannot multiply the amount of work.
        positions = {0}
        for chars in self._char_combos:
            positions = {
                pos + len(char)
                for pos in positions
                for char in chars
                if other.startswith(char, pos)
            }
            if not positions:
                return False

        # A match must consume `other` completely.
        return len(other) in positions