Models

File implementing the logic of the models

HuggingFaceValidator

Check fair housing violations with a HuggingFace model

Source code in app/models.py
class HuggingFaceValidator:
    """
    Check fair housing violations with a HuggingFace model
    """
    def __init__(self, model_path, pretrained_model):
        """
        Init HuggingFace model and tokenizer
        Parameters
        ----------
        model_path : str
            path to the trained HuggingFace model
        pretrained_model : str
            HuggingFace pretrained model for tokenizer
        """
        self.model = self.load_huggingface_model(model_path=model_path)
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_model)

    @staticmethod
    def load_huggingface_model(model_path):
        """
        Load HuggingFace model from path
        Parameters
        ----------
        model_path : str
            The path to HuggingFace model

        Returns
        -------
        model :  model
            HuggingFace model
        """
        model = load_model(model_path)
        return model

    def sentences_encoder(self, sentences):
        """
        Tokenize sentences for HuggingFace transformer.
        Parameters
        ----------
        sentences : list
            List of sentences

        Returns
        -------
        ids : tensor
            tokens ids
        attention_masks : tensor
            tokens attention masks

        """
        max_length = self.model.input_shape[0][1]
        inputs = self.tokenizer(sentences, max_length=max_length, padding='max_length', truncation=True,
                                return_attention_mask=True, return_token_type_ids=False, return_tensors='tf')
        ids = inputs['input_ids']
        attention_masks = inputs['attention_mask']
        return ids, attention_masks

    def validate(self, sentences):
        """
        Calculate predictions for a set of sentences.

        Parameters
        ----------
        sentences : list
            List of sentences in remark

        Returns
        -------
        probabilities : list
            Probability of each sentence prediction
        """

        input_ids, input_attention_masks = self.sentences_encoder(sentences)

        probabilities = self.model.predict([input_ids, input_attention_masks])

        return probabilities
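
A minimal usage sketch; the model path and the tokenizer checkpoint name below are placeholders for illustration, not values taken from the project configuration:

from app.models import HuggingFaceValidator

# hypothetical model path and tokenizer checkpoint, for illustration only
validator = HuggingFaceValidator(model_path='models/fair_housing_hf',
                                 pretrained_model='distilbert-base-uncased')

sentences = ['Great family home near schools.', 'No children allowed.']
probabilities = validator.validate(sentences)  # one probability vector per sentence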

__init__(model_path, pretrained_model)

Init HuggingFace model and tokenizer

Parameters:

    model_path (str): path to the trained HuggingFace model. Required.
    pretrained_model (str): HuggingFace pretrained model for the tokenizer. Required.
Source code in app/models.py
def __init__(self, model_path, pretrained_model):
    """
    Init HuggingFace model and tokenizer
    Parameters
    ----------
    model_path : str
        path to the trained HuggingFace model
    pretrained_model : str
        HuggingFace pretrained model for tokenizer
    """
    self.model = self.load_huggingface_model(model_path=model_path)
    self.tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_model)

load_huggingface_model(model_path) staticmethod

Load HuggingFace model from path

Parameters:

    model_path (str): the path to the HuggingFace model. Required.

Returns:

    model (model): HuggingFace model.

Source code in app/models.py
@staticmethod
def load_huggingface_model(model_path):
    """
    Load HuggingFace model from path
    Parameters
    ----------
    model_path : str
        The path to HuggingFace model

    Returns
    -------
    model :  model
        HuggingFace model
    """
    model = load_model(model_path)
    return model

sentences_encoder(sentences)

Tokenize sentences for HuggingFace transformer.

Parameters:

    sentences (list): list of sentences. Required.

Returns:

    ids (tensor): token ids.
    attention_masks (tensor): token attention masks.

Source code in app/models.py
def sentences_encoder(self, sentences):
    """
    Tokenize sentences for HuggingFace transformer.
    Parameters
    ----------
    sentences : list
        List of sentences

    Returns
    -------
    ids : tensor
        tokens ids
    attention_masks : tensor
        tokens attention masks

    """
    max_length = self.model.input_shape[0][1]
    inputs = self.tokenizer(sentences, max_length=max_length, padding='max_length', truncation=True,
                            return_attention_mask=True, return_token_type_ids=False, return_tensors='tf')
    ids = inputs['input_ids']
    attention_masks = inputs['attention_mask']
    return ids, attention_masks
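
A short sketch of what the encoder returns, reusing the validator instance from the class-level example above:

ids, attention_masks = validator.sentences_encoder(['No children allowed.'])
# both are TensorFlow tensors of shape (batch_size, max_length), where
# max_length is read from the model's input shape
print(ids.shape)              # token ids
print(attention_masks.shape)  # 1 for real tokens, 0 for padding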

validate(sentences)

Calculate predictions for a set of sentences.

Parameters:

    sentences (list): list of sentences in the remark. Required.

Returns:

    probabilities (list): probability of each sentence prediction.

Source code in app/models.py
def validate(self, sentences):
    """
    Calculate predictions for a set of sentences.

    Parameters
    ----------
    sentences : list
        List of sentences in remark

    Returns
    -------
    probabilities : list
        Probability of each sentence prediction
    """

    input_ids, input_attention_masks = self.sentences_encoder(sentences)

    probabilities = self.model.predict([input_ids, input_attention_masks])

    return probabilities

Preprocessor

Preprocessor for cleaning input text and segmenting it into sentences.

Source code in app/models.py
class Preprocessor:
    """
    Preprocessor for cleaning input text and segmenting it into sentences.
    """

    def __init__(self):
        """
        Init the regex for HTML tags and the spaCy sentencizer
        """
        self.cleanr = re.compile(CLEANR)
        self.sentencizer = Sentencizer()

    def preprocess_text(self, text):
        """
        Preprocess raw text.
        Remove HTML tags, special symbols, and redundant whitespace.
        Parameters
        ----------
        text : str
            input text

        Returns
        -------
        clean_text : str
            text without html tags
        """

        # remove HTML tags, whitespaces, tabs and special symbols
        clean_text = re.sub(self.cleanr, '', text)
        clean_text = re.sub('\n|\t|\r', ' ', clean_text)
        clean_text = re.sub('[^A-Za-z0-9 ]+', '', clean_text)

        # collapse multiple whitespaces into a single one
        clean_text = re.sub(' +', ' ', clean_text)
        return clean_text

    def get_sentences(self, text, lowercase=False):
        """
        Extract sentences from text
        Parameters
        ----------
        text : str
            input text
        lowercase : bool
            lowercase sentences if true

        Returns
        -------
        sentences : list
            Initial sentences in list
        clean_sentences : list
            Preprocessed list of sentences
        locations : list
            Locations of the sentences in remark
        """
        remark = text
        matches = re.finditer(r'[!\.\*\?][a-zA-Z]', text)
        i = 0
        for match in matches:
            if match.end() < len(remark):
                if remark[match.end()] == ".":
                    continue
            if remark[match.start() - 2:match.end()] != "$#!T":
                text = text[:match.start() + i] + \
                       text[match.start() + i:match.end() + i].replace(match.group(), " ".join(match.group())) + \
                       text[match.end() + i:]

                i += 1
        doc = nlp(text)
        sentences = []
        locations = []
        clean_sentences = []

        for sentence in doc.sents:

            sentences.append(str(sentence))
            sent_start = remark.find(str(sentence))

            sent_end = sent_start + len(str(sentence))
            if lowercase:
                sent = str(sentence).lower()
            else:
                sent = str(sentence)
            clean_sentences.append(self.preprocess_text(sent))
            locations.append({"start": sent_start, "end": sent_end})

        return sentences, clean_sentences, locations
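
A minimal sketch of the preprocessing flow; it assumes the module-level spaCy pipeline and the CLEANR regex defined in app/models.py are available at import time:

from app.models import Preprocessor

preprocessor = Preprocessor()
text = '<b>Cozy cottage!</b>Close to downtown.\nNo pets.'

sentences, clean_sentences, locations = preprocessor.get_sentences(text, lowercase=True)
print(sentences)        # sentences as segmented by spaCy
print(clean_sentences)  # lowercased, with HTML tags and special symbols stripped
print(locations)        # [{'start': ..., 'end': ...}, ...] offsets in the raw remark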

__init__()

Init the regex for HTML tags and the spaCy sentencizer

Source code in app/models.py
def __init__(self):
    """
    Init the regex for HTML tags and the spaCy sentencizer
    """
    self.cleanr = re.compile(CLEANR)
    self.sentencizer = Sentencizer()

get_sentences(text, lowercase=False)

Extract sentences from text

Parameters:

    text (str): input text. Required.
    lowercase (bool): lowercase sentences if true. Default: False.

Returns:

    sentences (list): initial sentences as a list.
    clean_sentences (list): preprocessed list of sentences.
    locations (list): locations of the sentences in the remark.

Source code in app/models.py
def get_sentences(self, text, lowercase=False):
    """
    Extract sentences from text
    Parameters
    ----------
    text : str
        input text
    lowercase : bool
        lowercase sentences if true

    Returns
    -------
    sentences : list
        Initial sentences in list
    clean_sentences : list
        Preprocessed list of sentences
    locations : list
        Locations of the sentences in remark
    """
    remark = text
    matches = re.finditer(r'[!\.\*\?][a-zA-Z]', text)
    i = 0
    for match in matches:
        if match.end() < len(remark):
            if remark[match.end()] == ".":
                continue
        if remark[match.start() - 2:match.end()] != "$#!T":
            text = text[:match.start() + i] + \
                   text[match.start() + i:match.end() + i].replace(match.group(), " ".join(match.group())) + \
                   text[match.end() + i:]

            i += 1
    doc = nlp(text)
    sentences = []
    locations = []
    clean_sentences = []

    for sentence in doc.sents:

        sentences.append(str(sentence))
        sent_start = remark.find(str(sentence))

        sent_end = sent_start + len(str(sentence))
        if lowercase:
            sent = str(sentence).lower()
        else:
            sent = str(sentence)
        clean_sentences.append(self.preprocess_text(sent))
        locations.append({"start": sent_start, "end": sent_end})

    return sentences, clean_sentences, locations

preprocess_text(text)

Preprocess raw text: remove HTML tags, special symbols, and redundant whitespace.

Parameters:

    text (str): input text. Required.

Returns:

    clean_text (str): text without HTML tags.

Source code in app/models.py
def preprocess_text(self, text):
    """
    Preprocess raw text.
    Remove HTML tags, special symbols, and redundant whitespace.
    Parameters
    ----------
    text : str
        input text

    Returns
    -------
    clean_text : str
        text without html tags
    """

    # remove HTML tags, whitespaces, tabs and special symbols
    clean_text = re.sub(self.cleanr, '', text)
    clean_text = re.sub('\n|\t|\r', ' ', clean_text)
    clean_text = re.sub('[^A-Za-z0-9 ]+', '', clean_text)

    # collapse multiple whitespaces into a single one
    clean_text = re.sub(' +', ' ', clean_text)
    return clean_text

ProfanityValidator

Check for banned words

Source code in app/models.py
class ProfanityValidator:
    """
    Check for banned words
    """

    def __init__(self, censor_words_path=config.CENSORED_WORLDS_PATH):
        """
        Init the profanity library with a banned-words file.
        Parameters
        ----------
        censor_words_path : str
            path to the censored-words file.
        """
        # init the profanity checker only when a word-list path is provided
        if censor_words_path:
            self.profanity = Profanity(words=censor_words_path)

    @staticmethod
    def is_contain_numbers(word):
        """
        Check whether a word should be skipped because it contains digits.
        """
        # filter short words containing digits, e.g. B1
        if any(char.isdigit() for char in word) and len(word) < 3:
            return True
        # filter words made up entirely of digits
        elif all(char.isdigit() for char in word):
            return True
        else:
            return False

    def validate(self, sentences):
        """
        Check whether the sentences contain censored words.
        Parameters
        ----------
        sentences : list[str]
            input list of sentences

        Returns
        -------
        predictions : np.ndarray[int]
            list of binary predictions
        words : list[str]
            profanity words

        """

        predictions = np.zeros(len(sentences))
        words = []
        for i, sentence in enumerate(sentences):
            curse_words = self.profanity.get_curse_words(sentence)

            if len(curse_words):
                for cw in curse_words:
                    if not self.is_contain_numbers(cw):
                        words.append(cw)
                        predictions[i] = 0
            else:
                predictions[i] = 1
        return predictions, words
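
A minimal usage sketch; by default the word list is read from config.CENSORED_WORLDS_PATH, and the example sentence is invented:

from app.models import ProfanityValidator

profanity = ProfanityValidator()
predictions, words = profanity.validate(['This remark is perfectly clean.'])
print(predictions)  # 1.0 for a clean sentence, 0.0 when a censored word is found
print(words)        # the censored words that were found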

__init__(censor_words_path=config.CENSORED_WORLDS_PATH)

Init the profanity library with a banned-words file.

Parameters:

    censor_words_path (str): path to the censored-words file. Default: config.CENSORED_WORLDS_PATH.
Source code in app/models.py
def __init__(self, censor_words_path=config.CENSORED_WORLDS_PATH):
    """
    Init the profanity library with a banned-words file.
    Parameters
    ----------
    censor_words_path : str
        path to the censored-words file.
    """
    # init the profanity checker only when a word-list path is provided
    if censor_words_path:
        self.profanity = Profanity(words=censor_words_path)

validate(sentences)

Check whether the sentences contain censored words.

Parameters:

    sentences (list[str]): input list of sentences. Required.

Returns:

    predictions (np.ndarray[int]): array of binary predictions.
    words (list[str]): profanity words.

Source code in app/models.py
def validate(self, sentences):
    """
    Check whether the sentences contain censored words.
    Parameters
    ----------
    sentences : list[str]
        input list of sentences

    Returns
    -------
    predictions : np.ndarray[int]
        list of binary predictions
    words : list[str]
        profanity words

    """

    predictions = np.zeros(len(sentences))
    words = []
    for i, sentence in enumerate(sentences):
        curse_words = self.profanity.get_curse_words(sentence)

        if len(curse_words):
            for cw in curse_words:
                if not self.is_contain_numbers(cw):
                    words.append(cw)
                    predictions[i] = 0
        else:
            predictions[i] = 1
    return predictions, words

RuleBasedValidator

Validate text with the spaCy Matcher.

Source code in app/models.py
class RuleBasedValidator:
    """
    Validate text with the spaCy Matcher.
    """

    def __init__(self):
        """
        Init spacy Matcher
        """
        self.matcher = Matcher(nlp.vocab)
        self.construct_patterns()

    def construct_patterns(self):
        """
        Construct violation patterns.
        competitors patterns - case sensitive
        strict_violations - not case sensitive

        """
        # for urduc in SUPPORTED_URDUCS:
        #     urduc_data = violations.urduc_competitors
        #     competitors = urduc_data[urduc - 1].competitors
        #     self.matcher.add(f'competitors_{urduc}',
        #                      get_nested_pattern(violations=competitors),
        #                      )
        self.matcher.add('strict_violations',
                         get_nested_pattern(violations=violations.fair_housing_violations, case_sensitive=False),
                         )

    @staticmethod
    def collect_matches(matches, doc, patterns=None):
        """
        Collect matched tokens from matches
        Parameters
        ----------

        matches : Union[List[Tuple[int, int, int]], List[Span]]
            list of matches with match_id and indices
        doc : spacy doc object
            spacy doc object for sentence
        patterns : list[str]
            list of strings for collecting specific patterns

        Returns
        -------
        matched_phrases : list[str]
            matched phrases
        """
        if patterns is None:
            patterns = ['strict_violations', 'competitors']
        matched_phrases = []
        if isinstance(matches, list):
            for match_id, start, end in matches:
                string_id = doc.vocab.strings[match_id]
                if string_id in patterns:
                    phrase = doc[start:end].text
                    matched_phrases.append(phrase)
            return matched_phrases
        else:
            raise ValueError

    def validate(self, sentences, urduc):
        """
        Validate input sentences
        Parameters
        ----------
        urduc : int
            identifier for urduc
        sentences : list[str]
            input list of sentences

        Returns
        -------
        results : list
            list of binary predictions
        all_matched_phrases : list[str]
            matched violating phrases
        """
        results = []
        all_matched_phrases = []
        patterns_to_collect = ['strict_violations', f'competitors_{urduc}']
        for sent in sentences:
            doc = nlp(sent)
            matches = self.matcher(doc)
            if not len(matches):
                results.append(1)
            else:
                doc_matches = self.collect_matches(matches=matches, doc=doc, patterns=patterns_to_collect)
                all_matched_phrases.extend(doc_matches)
                results.append(0)
        return results, all_matched_phrases

    def check_competitors(self, sentences, urduc):
        """
        Validate input sentences
        Parameters
        ----------
        urduc : int
            identifier for urduc
        sentences : list[str]
            input list of sentences

        Returns
        -------
        predictions : list[int],
            list of binary predictions
        names : list[str]
            list of company names
        """
        predictions, names = [], []
        for sent in sentences:
            doc = nlp(sent)
            matches = self.matcher(doc)
            if not len(matches):
                predictions.append(0)
            else:
                patterns = [f'competitors_{urduc}']
                names_in_doc = self.collect_matches(matches=matches, doc=doc, patterns=patterns)
                predictions.append(1)
                names.extend(names_in_doc)

        return predictions, names

    @staticmethod
    def find_word_indices(sentence, word):
        """
        Find all occurrences of a word in the sentence with a generator.
        Parameters
        ----------
        sentence : str
            input sentence
        word : str
            word to search

        Returns
        -------
            python generator object
        """
        start = 0
        while True:
            start = sentence.find(word, start)
            if start == -1:
                return
            yield start
            start += len(word)

    def check_name_variation(self, sentence, variation, max_length):
        """
        Check whether the pattern exists in the given sentence.
        The function takes the pattern's tokens and checks whether an increasing sequence of their occurrences exists.

        Parameters
        ----------
        sentence : str
            input sentence
        variation : list[str]
            list of strings (competitor pattern)
        max_length : int
            number for filtering sequences where np.diff(seq)<max_length

        Returns
        -------
        indices : list[int]
            indices of the pattern
        """
        all_indices = []
        for i in range(len(variation)):
            word = variation[i]
            word_indices = list(self.find_word_indices(sentence=sentence, word=word))
            if len(word_indices) == 0:
                return []
            else:
                all_indices.append(word_indices)
        valid_subsequences = []
        start_word_indices = all_indices[0]
        for start_ind in start_word_indices:
            subseq = [start_ind]
            is_valid = False
            for i, word_indices in enumerate(all_indices[1:]):
                # once an increasing index was appended, search for the next one
                if len(subseq) >= i + 1:
                    # find occurrences of the next word after the current index
                    where = np.where(np.array(word_indices) > subseq[i])[0]
                    for w_ind in where:
                        if 0 < (word_indices[w_ind] - subseq[i]) < max_length:
                            subseq.append(word_indices[w_ind])
                            if len(subseq) == len(variation):
                                is_valid = True
                        if is_valid:
                            break
                    if is_valid:
                        break
            valid_subsequences.append(subseq)

        for subseq in valid_subsequences:
            if len(subseq) == len(variation):
                return subseq
        return []

    def validate_competitors(self, sentences, urduc=2):
        """
        Validate competitors
        Parameters
        ----------
        sentences: list[str]
            input sentences
        urduc: int
            enum for urduc

        Returns
        -------
        predictions : list[int],
            predictions for each sentence
        names : list[str]
            matched competitor names
        """

        competitors = violations.urduc_competitors[urduc - 1].competitors
        competitors = {comp.name: (split_list_of_strings(comp.variations), comp.max_length) for comp in competitors}
        predictions = []
        names = []

        for sent in sentences:
            pred_per_sentence = []
            names_per_sentence = []
            for competitor_name, (variations, max_length) in competitors.items():
                for variation in variations:
                    variation_indices = self.check_name_variation(sentence=sent,
                                                                  variation=variation,
                                                                  max_length=max_length)
                    if len(variation_indices):
                        pred_per_sentence.append(0)
                        names_per_sentence.append(competitor_name)
                    else:
                        pred_per_sentence.append(1)
            predictions.append(min(pred_per_sentence))
            names.extend(names_per_sentence)

        return predictions, names
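
A minimal usage sketch; the example sentence and the urduc value are placeholders, and the actual matches depend on the violations configuration loaded by app/models.py:

from app.models import RuleBasedValidator

rule_validator = RuleBasedValidator()

# strict fair-housing phrases (case-insensitive patterns)
results, phrases = rule_validator.validate(sentences=['example remark text'], urduc=2)
print(results)  # 1 = clean sentence, 0 = violation found
print(phrases)  # the matched violating phrases

# competitor-name variations (token-sequence search)
predictions, names = rule_validator.validate_competitors(sentences=['example remark text'], urduc=2)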

__init__()

Init spacy Matcher

Source code in app/models.py
def __init__(self):
    """
    Init spacy Matcher
    """
    self.matcher = Matcher(nlp.vocab)
    self.construct_patterns()

check_competitors(sentences, urduc)

Validate input sentences

Parameters:

    urduc (int): identifier for urduc. Required.
    sentences (list[str]): input list of sentences. Required.

Returns:

    predictions (list[int]): list of binary predictions.
    names (list[str]): list of company names.

Source code in app/models.py
def check_competitors(self, sentences, urduc):
    """
    Validate input sentences
    Parameters
    ----------
    urduc : int
        identifier for urduc
    sentences : list[str]
        input list of sentences

    Returns
    -------
    predictions : list[int],
        list of binary predictions
    names : list[str]
        list of company names
    """
    predictions, names = [], []
    for sent in sentences:
        doc = nlp(sent)
        matches = self.matcher(doc)
        if not len(matches):
            predictions.append(0)
        else:
            patterns = [f'competitors_{urduc}']
            names_in_doc = self.collect_matches(matches=matches, doc=doc, patterns=patterns)
            predictions.append(1)
            names.extend(names_in_doc)

    return predictions, names

check_name_variation(sentence, variation, max_length)

Check whether the pattern exists in the given sentence. The function takes the pattern's tokens and checks whether an increasing sequence of their occurrences exists.

Parameters:

    sentence (str): input sentence. Required.
    variation (list[str]): list of strings (competitor pattern). Required.
    max_length (int): number for filtering sequences where np.diff(seq) < max_length. Required.

Returns:

    indices (list[int]): indices of the pattern.

Source code in app/models.py
def check_name_variation(self, sentence, variation, max_length):
    """
    Check whether the pattern exists in the given sentence.
    The function takes the pattern's tokens and checks whether an increasing sequence of their occurrences exists.

    Parameters
    ----------
    sentence : str
        input sentence
    variation : list[str]
        list of strings (competitor pattern)
    max_length : int
        number for filtering sequences where np.diff(seq)<max_length

    Returns
    -------
    indices : list[int]
        indices of the pattern
    """
    all_indices = []
    for i in range(len(variation)):
        word = variation[i]
        word_indices = list(self.find_word_indices(sentence=sentence, word=word))
        if len(word_indices) == 0:
            return []
        else:
            all_indices.append(word_indices)
    valid_subsequences = []
    start_word_indices = all_indices[0]
    for start_ind in start_word_indices:
        subseq = [start_ind]
        is_valid = False
        for i, word_indices in enumerate(all_indices[1:]):
            # once an increasing index was appended, search for the next one
            if len(subseq) >= i + 1:
                # find occurrences of the next word after the current index
                where = np.where(np.array(word_indices) > subseq[i])[0]
                for w_ind in where:
                    if 0 < (word_indices[w_ind] - subseq[i]) < max_length:
                        subseq.append(word_indices[w_ind])
                        if len(subseq) == len(variation):
                            is_valid = True
                    if is_valid:
                        break
                if is_valid:
                    break
        valid_subsequences.append(subseq)

    for subseq in valid_subsequences:
        if len(subseq) == len(variation):
            return subseq
    return []
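
A worked illustration of the index search, reusing the rule_validator instance from the class-level sketch; the variation and max_length are invented for the example:

# 'zil' occurs at index 4 and 'low' at index 8 in the sentence below;
# since 0 < 8 - 4 < 10, the increasing subsequence [4, 8] covers the
# whole variation and is returned.
indices = rule_validator.check_name_variation(sentence='see zil low dot com',
                                              variation=['zil', 'low'],
                                              max_length=10)
print(indices)  # [4, 8]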

collect_matches(matches, doc, patterns=None) staticmethod

Collect matched tokens from matches

Parameters:

    matches (Union[List[Tuple[int, int, int]], List[Span]]): list of matches with match_id and indices. Required.
    doc (spacy Doc): spaCy doc object for the sentence. Required.
    patterns (list[str]): list of strings for collecting specific patterns. Default: None.

Returns:

    matched_phrases (list[str]): matched phrases.

Source code in app/models.py
@staticmethod
def collect_matches(matches, doc, patterns=None):
    """
    Collect matched tokens from matches
    Parameters
    ----------

    matches : Union[List[Tuple[int, int, int]], List[Span]]
        list of matches with match_id and indices
    doc : spacy doc object
        spacy doc object for sentence
    patterns : list[str]
        list of strings for collecting specific patterns

    Returns
    -------
    matched_phrases : list[str]
        matched phrases
    """
    if patterns is None:
        patterns = ['strict_violations', 'competitors']
    matched_phrases = []
    if isinstance(matches, list):
        for match_id, start, end in matches:
            string_id = doc.vocab.strings[match_id]
            if string_id in patterns:
                phrase = doc[start:end].text
                matched_phrases.append(phrase)
        return matched_phrases
    else:
        raise ValueError

construct_patterns()

Construct violation patterns. Competitor patterns are case-sensitive; strict_violations patterns are case-insensitive.

Source code in app/models.py
def construct_patterns(self):
    """
    Construct violation patterns.
    competitors patterns - case sensitive
    strict_violations - not case sensitive

    """
    # for urduc in SUPPORTED_URDUCS:
    #     urduc_data = violations.urduc_competitors
    #     competitors = urduc_data[urduc - 1].competitors
    #     self.matcher.add(f'competitors_{urduc}',
    #                      get_nested_pattern(violations=competitors),
    #                      )
    self.matcher.add('strict_violations',
                     get_nested_pattern(violations=violations.fair_housing_violations, case_sensitive=False),
                     )

find_word_indices(sentence, word) staticmethod

Find all occurrences of a word in the sentence with a generator.

Parameters:

    sentence (str): input sentence. Required.
    word (str): word to search for. Required.

Returns:

    A Python generator object yielding start indices.
Source code in app/models.py
@staticmethod
def find_word_indices(sentence, word):
    """
    Find all occurrences of a word in the sentence with a generator.
    Parameters
    ----------
    sentence : str
        input sentence
    word : str
        word to search

    Returns
    -------
        python generator object
    """
    start = 0
    while True:
        start = sentence.find(word, start)
        if start == -1:
            return
        yield start
        start += len(word)
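
A short example of the generator; since find_word_indices is a staticmethod it can be called on the class directly:

from app.models import RuleBasedValidator

offsets = list(RuleBasedValidator.find_word_indices('cat catalog cat', 'cat'))
print(offsets)  # [0, 4, 12]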

validate(sentences, urduc)

Validate input sentences

Parameters:

    urduc (int): identifier for urduc. Required.
    sentences (list[str]): input list of sentences. Required.

Returns:

    results (list): list of binary predictions.
    all_matched_phrases (list[str]): matched violating phrases.

Source code in app/models.py
def validate(self, sentences, urduc):
    """
    Validate input sentences
    Parameters
    ----------
    urduc : int
        identifier for urduc
    sentences : list[str]
        input list of sentences

    Returns
    -------
    results : list
        list of binary predictions
    all_matched_phrases : list[str]
        matched violating phrases
    """
    results = []
    all_matched_phrases = []
    patterns_to_collect = ['strict_violations', f'competitors_{urduc}']
    for sent in sentences:
        doc = nlp(sent)
        matches = self.matcher(doc)
        if not len(matches):
            results.append(1)
        else:
            doc_matches = self.collect_matches(matches=matches, doc=doc, patterns=patterns_to_collect)
            all_matched_phrases.extend(doc_matches)
            results.append(0)
    return results, all_matched_phrases

validate_competitors(sentences, urduc=2)

Validate competitors

Parameters:

    sentences (list[str]): input sentences. Required.
    urduc (int): enum for urduc. Default: 2.

Returns:

    predictions (list[int]): predictions for each sentence.
    names (list[str]): matched competitor names.

Source code in app/models.py
def validate_competitors(self, sentences, urduc=2):
    """
    Validate competitors
    Parameters
    ----------
    sentences: list[str]
        input sentences
    urduc: int
        enum for urduc

    Returns
    -------
    predictions : list[int],
        predictions for each sentence
    names : list[str]
        matched competitor names
    """

    competitors = violations.urduc_competitors[urduc - 1].competitors
    competitors = {comp.name: (split_list_of_strings(comp.variations), comp.max_length) for comp in competitors}
    predictions = []
    names = []

    for sent in sentences:
        pred_per_sentence = []
        names_per_sentence = []
        for competitor_name, (variations, max_length) in competitors.items():
            for variation in variations:
                variation_indices = self.check_name_variation(sentence=sent,
                                                              variation=variation,
                                                              max_length=max_length)
                if len(variation_indices):
                    pred_per_sentence.append(0)
                    names_per_sentence.append(competitor_name)
                else:
                    pred_per_sentence.append(1)
        predictions.append(min(pred_per_sentence))
        names.extend(names_per_sentence)

    return predictions, names

SpacyValidator

Check fair housing violations with a spaCy text-classification model.

Source code in app/models.py
class SpacyValidator:
    """
    Check fair housing violations with a spaCy text-classification model.
    """
    def __init__(self, model_path):
        self.model = self.load_spacy_model(model_path=model_path)

    @staticmethod
    def load_spacy_model(model_path):
        """
        Load spacy model from path
        Parameters
        ----------
        model_path : str
            The path of spacy model

        Returns
        -------
        model : spacy model
            model
        """
        model = spacy.load(model_path)
        return model

    def validate(self, sentences, threshold):
        """
        Calculate predictions for a set of sentences.

        Parameters
        ----------
        sentences : list
            List of sentences in remark
        threshold : float
            threshold of the violation prediction

        Returns
        -------
        bool_predictions : list
            Prediction of each sentence in boolean format
        probabilities : list
            Probability of each prediction
        """

        predictions = [i.cats for i in self.model.pipe(sentences)]
        bool_predictions = [i['POSITIVE'] < threshold for i in predictions]
        probabilities = [i['POSITIVE'] for i in predictions]
        return bool_predictions, probabilities
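
A minimal usage sketch; the model path is a placeholder, and the validator expects a trained spaCy text categorizer that exposes a 'POSITIVE' label:

from app.models import SpacyValidator

spacy_validator = SpacyValidator(model_path='models/spacy_textcat')  # hypothetical path
bool_predictions, probabilities = spacy_validator.validate(sentences=['example remark'],
                                                           threshold=0.5)
print(bool_predictions)  # True when the 'POSITIVE' score falls below the threshold
print(probabilities)     # raw 'POSITIVE' scores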

load_spacy_model(model_path) staticmethod

Load spacy model from path

Parameters:

    model_path (str): the path of the spaCy model. Required.

Returns:

    model (spacy model): the loaded model.

Source code in app/models.py
@staticmethod
def load_spacy_model(model_path):
    """
    Load spacy model from path
    Parameters
    ----------
    model_path : str
        The path of spacy model

    Returns
    -------
    model : spacy model
        model
    """
    model = spacy.load(model_path)
    return model

validate(sentences, threshold)

Calculate predictions for a set of sentences.

Parameters:

    sentences (list): list of sentences in the remark. Required.
    threshold (float): threshold of the violation prediction. Required.

Returns:

    bool_predictions (list): prediction for each sentence in boolean format.
    probabilities (list): probability of each prediction.

Source code in app/models.py
def validate(self, sentences, threshold):
    """
    Calculate predictions for a set of sentences.

    Parameters
    ----------
    sentences : list
        List of sentences in remark
    threshold : float
        threshold of the violation prediction

    Returns
    -------
    bool_predictions : list
        Prediction of each sentence in boolean format
    probabilities : list
        Probability of each prediction
    """

    predictions = [i.cats for i in self.model.pipe(sentences)]
    bool_predictions = [i['POSITIVE'] < threshold for i in predictions]
    probabilities = [i['POSITIVE'] for i in predictions]
    return bool_predictions, probabilities

Validator

Class that combines the spaCy model with the strict-violation checks.

Source code in app/models.py
class Validator:
    """
    Class that combines the spaCy model with the strict-violation checks.
    """

    def __init__(self, model_path=config.SPACY_MODEL_PATH, censored_words_path=config.CENSORED_WORLDS_PATH):
        # Load the spacy model
        # self.spacy_validator = SpacyValidator(model_path=model_path)

        # init profanity checker
        self.profanity_model = ProfanityValidator(censor_words_path=censored_words_path)

        # init strict violations matcher
        self.rule_based_validator = RuleBasedValidator()

    def validate(self, sentences, clean_sentences, urduc, threshold=config.PREDICTION_THRESHOLD):
        """
        Validate sentences with the RuleBasedValidator and the ProfanityValidator (the SpacyValidator pass is currently commented out).
        Parameters
        ----------
        urduc: int
            identifier for urduc
        sentences: list[str]
            list of input sentences
        clean_sentences: list[str]
            list of preprocessed sentences
        threshold: float
            threshold for calculating predictions

        Returns
        -------
        final_predictions : list[bool]
            1 if sentence is valid
        final_probabilities : list[float]
            the probability of validation
        violations : set[str]
            set of violations
        """

        # pass through ViolationsMatcher
        predictions_rb, violations_rb = self.rule_based_validator.validate_competitors(sentences=clean_sentences,
                                                                                       urduc=urduc)
        probabilities_rb = [1 - pred for pred in predictions_rb]

        # check strict violations
        predictions_strict, violations_strict = self.rule_based_validator.validate(sentences=sentences, urduc=urduc)
        probabilities_strict = [1 - pred for pred in predictions_strict]

        # pass through library for profanity
        predictions_pf, violations_pf = self.profanity_model.validate(sentences=sentences)
        probabilities_pf = [1 - pred for pred in predictions_pf]

        # -------------- commented only for resolving conflict with spacy transformer -----------
        # predict with spacy model
        # model_predictions, probabilities = self.spacy_validator.validate(sentences=clean_sentences, threshold=threshold)

        # combine the results for output
        # final_predictions = np.array([predictions_rb, predictions_pf, predictions_strict, model_predictions]).min(
        #     axis=0)
        # final_probabilities = np.array([probabilities_rb, probabilities_pf, probabilities_strict, probabilities]).max(
        #     axis=0)
        # -------------- commented only for resolving conflict with spacy transformer -----------

        final_predictions = np.array([predictions_rb, predictions_pf, predictions_strict]).min(
            axis=0)
        final_probabilities = np.array([probabilities_rb, probabilities_pf, probabilities_strict]).max(
            axis=0)

        violations = set(violations_rb + violations_pf + violations_strict)

        return final_predictions, final_probabilities, violations

    def validate_competitors_and_profanity(self, preprocessed_sentences, text, urduc):
        """
        Validate sentences with the RuleBasedValidator and the ProfanityValidator.
        Parameters
        ----------
        preprocessed_sentences : list[str]
            list of preprocessed sentences
        urduc : int
            identifier for urduc
        text : str
            input text (not preprocessed)

        Returns
        -------
        pred : int
            0 for violation, 1 otherwise
        violations : set[str]
            set of violations


        """
        # pass through ViolationsMatcher
        predictions_rb, violations_rb = self.rule_based_validator.validate_competitors(sentences=preprocessed_sentences,
                                                                                       urduc=urduc)

        # check strict violations
        predictions_strict, violations_strict = self.rule_based_validator.validate(sentences=[text],
                                                                                   urduc=urduc)

        # pass through library for profanity
        predictions_pf, violations_pf = self.profanity_model.validate(sentences=[text])

        # combine violation tokens
        violations = set(violations_rb + violations_pf + violations_strict)
        # 0 for violation
        pred = np.min([np.min(predictions_strict), np.min(predictions_pf), np.min(predictions_rb)])

        return pred, violations
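
An end-to-end sketch combining Preprocessor and Validator; the remark text and the urduc value are invented, and the default paths come from the config module:

from app.models import Preprocessor, Validator

preprocessor = Preprocessor()
validator = Validator()  # default word-list and model paths from config

text = 'Charming home close to everything. Example remark text.'
sentences, clean_sentences, _ = preprocessor.get_sentences(text, lowercase=True)

predictions, probabilities, violations = validator.validate(sentences=sentences,
                                                            clean_sentences=clean_sentences,
                                                            urduc=2)
print(predictions)  # per sentence: 1 = valid, 0 = violation
print(violations)   # set of matched violating phrases and words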

validate(sentences, clean_sentences, urduc, threshold=config.PREDICTION_THRESHOLD)

Validate sentences with the RuleBasedValidator and the ProfanityValidator (the SpacyValidator pass is currently commented out).

Parameters:

    urduc (int): identifier for urduc. Required.
    sentences (list[str]): list of input sentences. Required.
    clean_sentences (list[str]): list of preprocessed sentences. Required.
    threshold (float): threshold for calculating predictions. Default: config.PREDICTION_THRESHOLD.

Returns:

    final_predictions (list[bool]): 1 if the sentence is valid.
    final_probabilities (list[float]): the probability of validation.
    violations (set[str]): set of violations.

Source code in app/models.py
def validate(self, sentences, clean_sentences, urduc, threshold=config.PREDICTION_THRESHOLD):
    """
    Validate sentences with the RuleBasedValidator and the ProfanityValidator (the SpacyValidator pass is currently commented out).
    Parameters
    ----------
    urduc: int
        identifier for urduc
    sentences: list[str]
        list of input sentences
    clean_sentences: list[str]
        list of preprocessed sentences
    threshold: float
        threshold for calculating predictions

    Returns
    -------
    final_predictions : list[bool]
        1 if sentence is valid
    final_probabilities : list[float]
        the probability of validation
    violations : set[str]
        set of violations
    """

    # pass through ViolationsMatcher
    predictions_rb, violations_rb = self.rule_based_validator.validate_competitors(sentences=clean_sentences,
                                                                                   urduc=urduc)
    probabilities_rb = [1 - pred for pred in predictions_rb]

    # check strict violations
    predictions_strict, violations_strict = self.rule_based_validator.validate(sentences=sentences, urduc=urduc)
    probabilities_strict = [1 - pred for pred in predictions_strict]

    # pass through library for profanity
    predictions_pf, violations_pf = self.profanity_model.validate(sentences=sentences)
    probabilities_pf = [1 - pred for pred in predictions_pf]

    # -------------- commented only for resolving conflict with spacy transformer -----------
    # predict with spacy model
    # model_predictions, probabilities = self.spacy_validator.validate(sentences=clean_sentences, threshold=threshold)

    # combine the results for output
    # final_predictions = np.array([predictions_rb, predictions_pf, predictions_strict, model_predictions]).min(
    #     axis=0)
    # final_probabilities = np.array([probabilities_rb, probabilities_pf, probabilities_strict, probabilities]).max(
    #     axis=0)
    # -------------- commented only for resolving conflict with spacy transformer -----------

    final_predictions = np.array([predictions_rb, predictions_pf, predictions_strict]).min(
        axis=0)
    final_probabilities = np.array([probabilities_rb, probabilities_pf, probabilities_strict]).max(
        axis=0)

    violations = set(violations_rb + violations_pf + violations_strict)

    return final_predictions, final_probabilities, violations

validate_competitors_and_profanity(preprocessed_sentences, text, urduc)

Validate sentences with the RuleBasedValidator and the ProfanityValidator.

Parameters:

    preprocessed_sentences (list[str]): list of preprocessed sentences. Required.
    urduc (int): identifier for urduc. Required.
    text (str): input text (not preprocessed). Required.

Returns:

    pred (int): 0 for violation, 1 otherwise.
    violations (set[str]): set of violations.

Source code in app/models.py
def validate_competitors_and_profanity(self, preprocessed_sentences, text, urduc):
    """
    Validate sentences with the RuleBasedValidator and the ProfanityValidator.
    Parameters
    ----------
    preprocessed_sentences : list[str]
        list of preprocessed sentences
    urduc : int
        identifier for urduc
    text : str
        input text (not preprocessed)

    Returns
    -------
    pred : int
        0 for violation, 1 otherwise
    violations : set[str]
        set of violations


    """
    # pass through ViolationsMatcher
    predictions_rb, violations_rb = self.rule_based_validator.validate_competitors(sentences=preprocessed_sentences,
                                                                                   urduc=urduc)

    # check strict violations
    predictions_strict, violations_strict = self.rule_based_validator.validate(sentences=[text],
                                                                               urduc=urduc)

    # pass through library for profanity
    predictions_pf, violations_pf = self.profanity_model.validate(sentences=[text])

    # combine violation tokens
    violations = set(violations_rb + violations_pf + violations_strict)
    # 0 for violation
    pred = np.min([np.min(predictions_strict), np.min(predictions_pf), np.min(predictions_rb)])

    return pred, violations

get_nested_pattern(violations, case_sensitive=True)

Construct spaCy patterns from a list of strings. The case_sensitive flag controls whether the patterns are case-sensitive.

Parameters:

    violations (list[str]): input list of strings. Required.
    case_sensitive (bool): True for case-sensitive patterns. Default: True.

Returns:

    pattern (list[list[dict[str, any]]]): constructed spaCy pattern.

Source code in app/models.py
def get_nested_pattern(violations, case_sensitive=True):
    """
    Construct spaCy patterns from the list of strings.
    The case_sensitive flag controls whether the patterns are case-sensitive.
    Parameters
    ----------
    violations : list[str]
        input list of strings
    case_sensitive : bool
        True for case-sensitive patterns.

    Returns
    -------
    pattern : list[list[dict[str, any]]]
        constructed spacy pattern

    """
    violations_split = split_list_of_strings(strings=violations, tokenize=True)
    pattern = []
    for violation in violations_split:
        viol_pattern = []
        for item in violation:
            if case_sensitive:
                item_pattern = {'TEXT': item}
            else:
                item_pattern = {'LOWER': item.text.lower()}
            viol_pattern.append(item_pattern)
        pattern.append(viol_pattern)
    return pattern
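
A small example of the constructed pattern; the phrase is illustrative. With case_sensitive=False every token becomes a {'LOWER': ...} constraint:

pattern = get_nested_pattern(violations=['No Kids'], case_sensitive=False)
print(pattern)  # [[{'LOWER': 'no'}, {'LOWER': 'kids'}]]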

split_list_of_strings(strings, tokenize=False)

Split list of strings to tokens

Parameters:

    tokenize (bool): tokenize with spaCy or split on spaces. Default: False.
    strings (list[str]): input list of strings. Required.

Returns:

    result (list[list[str]]): nested list with split items.

Source code in app/models.py
def split_list_of_strings(strings, tokenize=False):
    """
    Split list of strings to tokens
    Parameters
    ----------
    tokenize : bool
        tokenize with Spacy or split with spaces
    strings : list[str]
        input list of strings

    Returns
    -------
    result : list[list[str]]
        nested list with split items
    """
    if not tokenize:
        return [s.split(' ') for s in strings]
    else:
        return [[x for x in nlp(s)] for s in strings]
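
A small example of both modes; with tokenize=True the items are spaCy Token objects rather than plain strings:

print(split_list_of_strings(['no kids', 'adults only']))
# [['no', 'kids'], ['adults', 'only']]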