Better Profanity

`Profanity`

Detect censored words and hide them with a special character.

Source code in better_profanity/better_profanity.py

class Profanity:
    """Detect censored words in a text and hide them with a special character."""

    def __init__(self, words=None, special_cases=SPECIAL_CASES):
        """
        Initialise the word list and the special cases.

        Parameters
        ----------
        words : str or Iterable[str] or None
            A file path to a word list (when a ``str``) or a collection of
            words to censor. `None` to use the default word list.
        special_cases : list[str]
            Special cases reported by `check_special_cases`.

        Raises
        ------
        TypeError
            If `words` is not `None`, a ``str`` or an iterable.
        """
        if words is not None and not isinstance(words, (str, Iterable)):
            raise TypeError("words must be of type str, list, or None")
        self.CENSOR_WORDSET = []
        # Look-alike characters accepted in place of each letter.
        self.CHARS_MAPPING = {
            "a": ("a", "@", "*", "4"),
            "i": ("i", "*", "l", "1"),
            "o": ("o", "*", "0", "@"),
            "u": ("u", "*", "v"),
            "v": ("v", "*", "u"),
            "l": ("l", "1"),
            "e": ("e", "*", "3"),
            "s": ("s", "$", "5"),
            "t": ("t", "7"),
            "'": ("'", "’"),
        }
        # Largest number of non-allowed characters found in a single censor
        # word; used as the look-ahead depth when collecting following words.
        self.MAX_NUMBER_COMBINATIONS = 1
        self.ALLOWED_CHARACTERS = ALLOWED_CHARACTERS
        self._default_wordlist_filename = get_complete_path_of_file(
            "profanity_wordlist.txt"
        )
        self.SPECIAL_CASES = special_cases
        # isinstance (not an exact type check) so str subclasses are also
        # treated as a file path instead of being iterated char by char.
        if isinstance(words, str):
            self.load_censor_words_from_file(words)
        else:
            self.load_censor_words(custom_words=words)

    ## PUBLIC ##

    def censor(self, text, censor_char="*"):
        """
        Replace the swear words in the text with `censor_char`.

        Parameters
        ----------
        text : str
            Text to censor. Non-string values are converted with ``str()``.
        censor_char : str
            Character used to build the replacement for swear words.
            Non-string values are converted with ``str()``.

        Returns
        -------
        str
            The censored text.
        """
        if not isinstance(text, str):
            text = str(text)
        if not isinstance(censor_char, str):
            censor_char = str(censor_char)

        if not self.CENSOR_WORDSET:
            self.load_censor_words()
        return self._hide_swear_words(text, censor_char)

    def load_censor_words_from_file(self, filename, **kwargs):
        """
        Load censor words from a file, replacing the current word set.

        Parameters
        ----------
        filename : str
            The file name of censor words
        **kwargs
            Forwarded to `_populate_words_to_wordset` (``whitelist_words``).
        """
        words = read_wordlist(filename)
        self._populate_words_to_wordset(words, **kwargs)

    def load_censor_words(self, custom_words=None, **kwargs):
        """
        Generate the set of words that need to be censored.

        Parameters
        ----------
        custom_words : list[str]
            Words that should be censored. When empty or `None`, the default
            word list file is used instead.
        **kwargs
            Forwarded to `_populate_words_to_wordset` (``whitelist_words``).
        """
        # Replace the words from `profanity_wordlist.txt` with a custom list
        custom_words = custom_words or read_wordlist(self._default_wordlist_filename)
        self._populate_words_to_wordset(custom_words, **kwargs)

    def add_censor_words(self, custom_words):
        """
        Add custom censor words to the existing set.

        Parameters
        ----------
        custom_words : list[str] or tuple[str] or set[str]
            Custom words to censor

        Raises
        ------
        TypeError
            If `custom_words` is not a list, tuple or set.
        """
        if not isinstance(custom_words, (list, tuple, set)):
            raise TypeError(
                "Function 'add_censor_words' only accepts list, tuple or set."
            )
        for word in custom_words:
            # Candidates are lowercased before lookup, so stored words must be
            # lowercase too or they can never match.
            word = word.lower()
            # Keep the look-ahead depth in sync, as the bulk loader does.
            self.MAX_NUMBER_COMBINATIONS = max(
                self.MAX_NUMBER_COMBINATIONS,
                self._count_non_allowed_characters(word),
            )
            self.CENSOR_WORDSET.append(
                VaryingString(word, char_map=self.CHARS_MAPPING)
            )

    def contains_profanity(self, text):
        """
        Return True if the input text has any swear words.

        Parameters
        ----------
        text : str
            Text to check. Non-string values are converted with ``str()``.
        """
        # `censor` always returns a str; compare like with like so that a
        # clean non-str input (e.g. 123) is not reported as profane.
        if not isinstance(text, str):
            text = str(text)
        return text != self.censor(text)

    ## PRIVATE ##

    def _populate_words_to_wordset(self, words, *, whitelist_words=None):
        """
        Replace the internal word set with `words`, minus `whitelist_words`.

        Parameters
        ----------
        words : Iterable[str]
            Words to censor.
        whitelist_words : list[str] or set[str] or tuple[str], optional
            Words that must not be censored (compared case-insensitively).

        Raises
        ------
        TypeError
            If `whitelist_words` is not a list, set, tuple or `None`.
        ValueError
            If an entry of `whitelist_words` is not a ``str``.
        """
        if whitelist_words is not None and not isinstance(
                whitelist_words, (list, set, tuple)
        ):
            raise TypeError(
                "The 'whitelist_words' keyword argument only accepts list, tuple or set."
            )

        # Validation. Build a fresh set rather than assigning by index, which
        # fails for tuples/sets and would mutate the caller's list.
        whitelist = set()
        for word in whitelist_words or ():
            if not isinstance(word, str):
                raise ValueError(
                    "Each word in 'whitelist_words' must be 'str' type, "
                    "but '{word}' found.".format(word=type(word))
                )
            whitelist.add(word.lower())

        # Populate the words into an internal wordset.
        # All words in CENSOR_WORDSET must be in lowercase; lowercasing before
        # de-duplicating avoids storing case variants of a word twice.
        all_censor_words = []
        for word in {word.lower() for word in words}:
            if word in whitelist:
                continue

            num_of_non_allowed_chars = self._count_non_allowed_characters(word)
            if num_of_non_allowed_chars > self.MAX_NUMBER_COMBINATIONS:
                self.MAX_NUMBER_COMBINATIONS = num_of_non_allowed_chars

            all_censor_words.append(VaryingString(word, char_map=self.CHARS_MAPPING))

        # The default wordlist takes ~5MB+ of memory
        self.CENSOR_WORDSET = all_censor_words

    def _count_non_allowed_characters(self, word):
        """Return how many characters of `word` are not in `ALLOWED_CHARACTERS`."""
        return sum(1 for char in word if char not in self.ALLOWED_CHARACTERS)

    def _update_next_words_indices(self, text, words_indices, start_idx):
        """
        Return the list of next words after the input index.

        Parameters
        ----------
        text : str
            text to censor
        words_indices : list[tuple[str, int]]
            Pairs previously returned by `_get_next_words`; modified in place.
        start_idx : int
            The starting index

        Returns
        -------
        list[tuple[str, int]]
            The refreshed list of (word, end index) pairs.
        """
        if not words_indices:
            words_indices = self._get_next_words(
                text, start_idx, self.MAX_NUMBER_COMBINATIONS
            )
        else:
            # Drop the two entries describing the word that is now current,
            # then look one word further ahead unless the end was reached.
            del words_indices[:2]
            if words_indices and words_indices[-1][0] != "":
                words_indices += self._get_next_words(text, words_indices[-1][1], 1)
        return words_indices

    def _hide_swear_words(self, text, censor_char):
        """
        Replace the swear words with censor characters.

        Parameters
        ----------
        text : str
            Text to censor
        censor_char : str
            A character for replacement

        Returns
        -------
        censored_text : str
            Censored text
        """
        start_idx_of_next_word = self._get_start_index_of_next_word(text, 0)

        # If there are no words in the text, return the raw text without parsing
        if start_idx_of_next_word >= len(text) - 1:
            return text

        pieces = []
        # Left strip the text, to avoid inaccurate parsing
        if start_idx_of_next_word > 0:
            pieces.append(text[:start_idx_of_next_word])
            text = text[start_idx_of_next_word:]

        cur_word = ""
        skip_index = -1
        next_words_indices = []

        # Splitting each word in the text to compare with censored words
        for index, char in enumerate(text):
            if index < skip_index:
                continue
            if char in self.ALLOWED_CHARACTERS:
                cur_word += char
                continue

            # Skip continuous non-allowed characters
            if cur_word.strip() == "":
                pieces.append(char)
                cur_word = ""
                continue

            # Iterate the next words combined with the current one
            # to check if it forms a swear word
            next_words_indices = self._update_next_words_indices(
                text, next_words_indices, index
            )
            contains_swear_word, end_index = any_next_words_form_swear_word(
                cur_word, next_words_indices, self.CENSOR_WORDSET
            )

            if contains_swear_word:
                cur_word = get_replacement_for_swear_word(censor_char)
                skip_index = end_index
                char = ""
                next_words_indices = []
            elif cur_word.lower() in self.CENSOR_WORDSET:
                # The current word alone is a swear word
                cur_word = get_replacement_for_swear_word(censor_char)

            pieces.append(cur_word + char)
            cur_word = ""

        # Final check for a trailing word not followed by a separator
        if cur_word != "" and skip_index < len(text) - 1:
            if cur_word.lower() in self.CENSOR_WORDSET:
                cur_word = get_replacement_for_swear_word(censor_char)
            pieces.append(cur_word)
        return "".join(pieces)

    def check_special_cases(self, text):
        """
        Check special cases.

        Parameters
        ----------
        text : str
            Text to check for special cases

        Returns
        -------
        curse_words : list[str]
            Special cases found in `text` by case-insensitive substring match.
        """
        lowered_text = text.lower()
        return [case for case in self.SPECIAL_CASES if case.lower() in lowered_text]

    def get_curse_words(self, text):
        """
        Get list of curse words.

        Parameters
        ----------
        text : str
            Text to get curse words from. Non-string values are converted
            with ``str()``.

        Returns
        -------
        curse_words : list[str]
            Curse words found in the text, followed by matching special cases.
        """
        if not isinstance(text, str):
            text = str(text)
        # Append a trailing character so the scan below also examines the
        # last word of the text.
        text += "."
        curse_words = []
        cur_word = ""
        skip_index = -1
        next_words_indices = []
        start_idx_of_next_word = self._get_start_index_of_next_word(text, 0)

        # If there are no words in the text, return the empty list
        if start_idx_of_next_word >= len(text) - 1:
            return []

        # Left strip the text, to avoid inaccurate parsing
        if start_idx_of_next_word > 0:
            text = text[start_idx_of_next_word:]

        # Splitting each word in the text to compare with censored words
        for index, char in enumerate(text):
            if index < skip_index:
                continue
            if char in self.ALLOWED_CHARACTERS:
                cur_word += char
                continue

            # Skip continuous non-allowed characters
            if cur_word.strip() == "":
                cur_word = ""
                continue

            # Iterate the next words combined with the current one
            # to check if it forms a swear word
            next_words_indices = self._update_next_words_indices(
                text, next_words_indices, index
            )
            contains_swear_word, end_index = any_next_words_form_swear_word(
                cur_word, next_words_indices, self.CENSOR_WORDSET
            )

            # Reset for every word: a phrase matched earlier must not be
            # reported again for each following word.
            cur_phrase = ""
            if contains_swear_word:
                skip_index = end_index
                next_words_indices = []
                cur_phrase = cur_word + text[index:end_index]

            if cur_phrase and cur_phrase.lower() in self.CENSOR_WORDSET:
                curse_words.append(cur_phrase)
            elif cur_word.lower() in self.CENSOR_WORDSET:
                curse_words.append(cur_word)

            cur_word = ""

        if cur_word != "" and skip_index < len(text) - 1:
            if cur_word.lower() in self.CENSOR_WORDSET:
                curse_words.append(cur_word)

        curse_words.extend(self.check_special_cases(text=text))
        return curse_words

    def _get_start_index_of_next_word(self, text, start_idx):
        """
        Return the index of the first character of the next word in the given text.

        Parameters
        ----------
        text : str
            Text
        start_idx : int
            Start index

        Returns
        -------
        start_idx_of_next_word : int
            Start index of the next word, or ``len(text)`` if there is none.
        """
        for index in range(start_idx, len(text)):
            if text[index] in self.ALLOWED_CHARACTERS:
                return index
        return len(text)

    def _get_next_word_and_end_index(self, text, start_idx):
        """
        Return the next word in the given text and the index where scanning stopped.

        Parameters
        ----------
        text : str
            Text
        start_idx : int
            Start index

        Returns
        -------
        next_word : str
            Next word
        index : int
            Index of the first non-allowed character after the word, or the
            last index of `text` when the word runs to the end.
        """
        next_word = ""
        index = start_idx
        for index in range(start_idx, len(text)):
            char = text[index]
            if char not in self.ALLOWED_CHARACTERS:
                break
            next_word += char
        return next_word, index

    def _get_next_words(self, text, start_idx, num_of_next_words=1):
        """
        Return a list of pairs of next words and next words included with separators, combined with their end indices.
        For example: Word `hand_job` has next words pairs: `job`, `_job`.

        Parameters
        ----------
        text : str
            Text
        start_idx : int
            Start index
        num_of_next_words : int
            Number of next words

        Returns
        -------
        words : list[tuple[str, int]]
            For each following word, two (word, end index) pairs: the bare
            word and the word prefixed with its leading separators.
        """
        # Find the starting index of the next word
        start_idx_of_next_word = self._get_start_index_of_next_word(text, start_idx)

        # Return a pair of empty words if there are no other words
        if start_idx_of_next_word >= len(text) - 1:
            return [("", start_idx_of_next_word), ("", start_idx_of_next_word)]

        # Combine the words into a list
        next_word, end_index = self._get_next_word_and_end_index(
            text, start_idx_of_next_word
        )

        words = [
            (next_word, end_index),
            ("%s%s" % (text[start_idx:start_idx_of_next_word], next_word), end_index),
        ]
        if num_of_next_words > 1:
            words.extend(self._get_next_words(text, end_index, num_of_next_words - 1))

        return words

`__init__(words=None, special_cases=SPECIAL_CASES)`

Initialise the word list and the special cases.

Parameters:

Name	Type	Description	Default
`words`	`list[str]`	Collection of words or file path for a list of words to censor. `None` to use the default word list.	`None`
`special_cases`	`list[str]`	Special cases to censor	`SPECIAL_CASES`

Source code in better_profanity/better_profanity.py

def __init__(self, words=None, special_cases=SPECIAL_CASES):
    """
    Initialise the word list and the special cases.

    Parameters
    ----------
    words : str or Iterable[str] or None
        A file path to a word list (when a ``str``) or a collection of
        words to censor. `None` to use the default word list.
    special_cases : list[str]
        Special cases to censor

    Raises
    ------
    TypeError
        If `words` is not `None`, a ``str`` or an iterable.
    """
    if words is not None and not isinstance(words, (str, Iterable)):
        raise TypeError("words must be of type str, list, or None")
    self.CENSOR_WORDSET = []
    # Look-alike characters accepted in place of each letter.
    self.CHARS_MAPPING = {
        "a": ("a", "@", "*", "4"),
        "i": ("i", "*", "l", "1"),
        "o": ("o", "*", "0", "@"),
        "u": ("u", "*", "v"),
        "v": ("v", "*", "u"),
        "l": ("l", "1"),
        "e": ("e", "*", "3"),
        "s": ("s", "$", "5"),
        "t": ("t", "7"),
        "'": ("'", "’"),
    }
    self.MAX_NUMBER_COMBINATIONS = 1
    self.ALLOWED_CHARACTERS = ALLOWED_CHARACTERS
    self._default_wordlist_filename = get_complete_path_of_file(
        "profanity_wordlist.txt"
    )
    self.SPECIAL_CASES = special_cases
    # isinstance (not an exact type check) so str subclasses are also
    # treated as a file path instead of being iterated char by char.
    if isinstance(words, str):
        self.load_censor_words_from_file(words)
    else:
        self.load_censor_words(custom_words=words)

`add_censor_words(custom_words)`

Add custom censor word to existing set.

Parameters:

Name	Type	Description	Default
`custom_words`	`list[str]`	Custom words to censor	required

Source code in better_profanity/better_profanity.py

def add_censor_words(self, custom_words):
    """
    Add custom censor words to the existing set.

    Parameters
    ----------
    custom_words : list[str] or tuple[str] or set[str]
        Custom words to censor

    Raises
    ------
    TypeError
        If `custom_words` is not a list, tuple or set.
    """
    if not isinstance(custom_words, (list, tuple, set)):
        raise TypeError(
            "Function 'add_censor_words' only accepts list, tuple or set."
        )
    for word in custom_words:
        # Candidates are lowercased before lookup, so stored words must be
        # lowercase too or they can never match.
        word = word.lower()
        # Keep the look-ahead depth in sync with the newly added word.
        self.MAX_NUMBER_COMBINATIONS = max(
            self.MAX_NUMBER_COMBINATIONS,
            self._count_non_allowed_characters(word),
        )
        self.CENSOR_WORDSET.append(VaryingString(word, char_map=self.CHARS_MAPPING))

`censor(text, censor_char='*')`

Replace the swear words in the text with censor_char.

Parameters:

Name	Type	Description	Default
`text`	`str`	Text to censor.	required
`censor_char`	`str`	Character used to replace the swear words	`'*'`

Source code in better_profanity/better_profanity.py

def censor(self, text, censor_char="*"):
    """
    Return `text` with its swear words replaced using `censor_char`.

    Parameters
    ----------
    text : str
        Text to censor. Non-string values are converted with ``str()``.
    censor_char : str
        Character used for the replacement. Non-string values are
        converted with ``str()``.
    """
    text = text if isinstance(text, str) else str(text)
    censor_char = censor_char if isinstance(censor_char, str) else str(censor_char)

    # Lazily load the default word list when nothing is loaded yet.
    wordset_is_empty = not self.CENSOR_WORDSET
    if wordset_is_empty:
        self.load_censor_words()

    return self._hide_swear_words(text, censor_char)

`check_special_cases(text)`

Check special cases.

Parameters:

Name	Type	Description	Default
`text`	`str`	Text to check for special cases	required

Returns:

Name	Type	Description
`curse_words`	`list[str]`	Set of curse words

Source code in better_profanity/better_profanity.py

def check_special_cases(self, text):
    """
    Check special cases.

    Parameters
    ----------
    text : str
        Text to check for special cases

    Returns
    -------
    curse_words : list[str]
        Special cases found in `text` by case-insensitive substring match,
        in the order they appear in `SPECIAL_CASES`.
    """
    # Lowercase the text once instead of once per special case.
    lowered_text = text.lower()
    return [case for case in self.SPECIAL_CASES if case.lower() in lowered_text]

`contains_profanity(text)`

Return True if the input text has any swear words.

Parameters:

Name	Type	Description	Default
`text`	`str`	Text to check for swear words	required

Source code in better_profanity/better_profanity.py

def contains_profanity(self, text):
    """
    Return True if the input text has any swear words.

    Parameters
    ----------
    text : str
        Text to check. Non-string values are converted with ``str()``.
    """
    # `censor` always returns a str; compare like with like so that a
    # clean non-str input (e.g. 123) is not reported as profane.
    if not isinstance(text, str):
        text = str(text)
    return text != self.censor(text)

`get_curse_words(text)`

Get list of curse words.

Parameters:

Name	Type	Description	Default
`text`	`str`	Text to get curse words	required

Returns:

Name	Type	Description
`curse_words`	`list[str]`	Set of curse words

Source code in better_profanity/better_profanity.py

def get_curse_words(self, text):
    """
    Get list of curse words.

    Parameters
    ----------
    text : str
        Text to get curse words from. Non-string values are converted
        with ``str()``.

    Returns
    -------
    curse_words : list[str]
        Curse words found in the text, followed by matching special cases.
    """
    if not isinstance(text, str):
        text = str(text)
    # Append a trailing character so the scan below also examines the
    # last word of the text.
    text += "."
    curse_words = []
    cur_word = ""
    skip_index = -1
    next_words_indices = []
    start_idx_of_next_word = self._get_start_index_of_next_word(text, 0)

    # If there are no words in the text, return the empty list
    if start_idx_of_next_word >= len(text) - 1:
        return []

    # Left strip the text, to avoid inaccurate parsing
    if start_idx_of_next_word > 0:
        text = text[start_idx_of_next_word:]

    # Splitting each word in the text to compare with censored words
    for index, char in enumerate(text):
        if index < skip_index:
            continue
        if char in self.ALLOWED_CHARACTERS:
            cur_word += char
            continue

        # Skip continuous non-allowed characters
        if cur_word.strip() == "":
            cur_word = ""
            continue

        # Iterate the next words combined with the current one
        # to check if it forms a swear word
        next_words_indices = self._update_next_words_indices(
            text, next_words_indices, index
        )
        contains_swear_word, end_index = any_next_words_form_swear_word(
            cur_word, next_words_indices, self.CENSOR_WORDSET
        )

        # Reset for every word: a phrase matched earlier must not be
        # reported again for each following word.
        cur_phrase = ""
        if contains_swear_word:
            skip_index = end_index
            next_words_indices = []
            cur_phrase = cur_word + text[index:end_index]

        if cur_phrase and cur_phrase.lower() in self.CENSOR_WORDSET:
            curse_words.append(cur_phrase)
        elif cur_word.lower() in self.CENSOR_WORDSET:
            curse_words.append(cur_word)

        cur_word = ""

    if cur_word != "" and skip_index < len(text) - 1:
        if cur_word.lower() in self.CENSOR_WORDSET:
            curse_words.append(cur_word)

    curse_words.extend(self.check_special_cases(text=text))
    return curse_words

`load_censor_words(custom_words=None, **kwargs)`

Generate a set of words that need to be censored.

Parameters:

Name	Type	Description	Default
`custom_words`	`list[str]`	Set of words that should be censored.	`None`

Source code in better_profanity/better_profanity.py

def load_censor_words(self, custom_words=None, **kwargs):
    """
    Build the set of words to censor.

    Parameters
    ----------
    custom_words : list[str]
        Words that should be censored. When empty or `None`, the words
        are read from the default word list file instead.
    **kwargs
        Passed through to `_populate_words_to_wordset`.
    """
    if custom_words:
        # A custom list replaces the words from `profanity_wordlist.txt`.
        wordlist = custom_words
    else:
        wordlist = read_wordlist(self._default_wordlist_filename)
    self._populate_words_to_wordset(wordlist, **kwargs)

`load_censor_words_from_file(filename, **kwargs)`

Load censor words from file

Parameters:

Name	Type	Description	Default
`filename`	`str`	The file name of censor words	required

Source code in better_profanity/better_profanity.py

def load_censor_words_from_file(self, filename, **kwargs):
    """
    Read censor words from a file and load them into the word set.

    Parameters
    ----------
    filename : str
        The file name of censor words
    **kwargs
        Passed through to `_populate_words_to_wordset`.
    """
    self._populate_words_to_wordset(read_wordlist(filename), **kwargs)

`VaryingString`

Represents a string with varying character representations.

Source code in better_profanity/varying_string.py

class VaryingString:
    """Represents a string with varying character representations.

    An instance compares equal to any ``str`` that can be produced by
    replacing each character of the original string with one of the
    alternatives listed for it in `char_map`.
    """

    def __init__(self, string, char_map=None):
        """
        Parameters
        ----------
        string : str
            The original string.
        char_map : dict[str, tuple[str, ...]], optional
            Maps a character to the substitutions accepted in its place.
            Characters missing from the map only match themselves.
        """
        # Avoid a shared mutable default argument.
        if char_map is None:
            char_map = {}

        self._original = string

        # There is not necessarily a single length for all of this string's variants.
        # Some character substitutions may include more than one character or empty
        # substitutions.
        self._min_len = 0
        self._max_len = 0

        # Create list of all possible character combinations.
        self._char_combos = []
        for char in self._original:
            variants = char_map[char] if char in char_map else (char,)
            self._char_combos.append(variants)
            lens = [len(variant) for variant in variants]
            self._min_len += min(lens)
            self._max_len += max(lens)

    def __str__(self):
        return self._original

    def __repr__(self):
        return "{}({!r})".format(type(self).__name__, self._original)

    def __eq__(self, other):
        """Return True if `other` is a ``str`` matching one of the variants.

        Raises
        ------
        NotImplementedError
            If `other` is another `VaryingString` (not supported).
        """
        if self is other:
            return True
        if isinstance(other, VaryingString):
            # We have no use case for this yet.
            raise NotImplementedError
        if not isinstance(other, str):
            return False
        if not self._min_len <= len(other) <= self._max_len:
            return False

        # Track the set of still-unmatched suffixes of `other` instead of a
        # single one, to account for character substitutions that contain
        # multiple (or zero) characters. A set keeps it bounded by the
        # number of distinct suffixes.
        slices = {other}
        for chars in self._char_combos:
            new_slices = set()
            for char in chars:
                if not char:
                    # Empty substitution: consumes nothing.
                    new_slices.update(slices)
                    continue
                len_char = len(char)
                new_slices.update(
                    sl[len_char:] for sl in slices if sl.startswith(char)
                )
            if not new_slices:
                return False
            slices = new_slices
        # A match requires some path to have consumed `other` entirely.
        return "" in slices

Better Profanity

Profanity

__init__(words=None, special_cases=SPECIAL_CASES)

add_censor_words(custom_words)

censor(text, censor_char='*')

check_special_cases(text)

contains_profanity(text)

get_curse_words(text)

load_censor_words(custom_words=None, **kwargs)

load_censor_words_from_file(filename, **kwargs)

VaryingString

`Profanity`

`__init__(words=None, special_cases=SPECIAL_CASES)`

`add_censor_words(custom_words)`

`censor(text, censor_char='*')`

`check_special_cases(text)`

`contains_profanity(text)`

`get_curse_words(text)`

`load_censor_words(custom_words=None, **kwargs)`

`load_censor_words_from_file(filename, **kwargs)`

`VaryingString`