Models

File implementing the logic of the models

HuggingFaceValidator

Check fair housing violations with a HuggingFace model

Source code in app/models.py
class HuggingFaceValidator:
    """
    Check fair housing violations with a HuggingFace model
    """
    def __init__(self, model_path, pretrained_model):
        """
        Init HuggingFace model and tokenizer
        Parameters
        ----------
        model_path : str
            path to the trained HuggingFace model
        pretrained_model : str
            HuggingFace pretrained model for tokenizer
        """
        self.model = self.load_huggingface_model(model_path=model_path)
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_model)

    @staticmethod
    def load_huggingface_model(model_path):
        """
        Load HuggingFace model from path
        Parameters
        ----------
        model_path : str
            The path to HuggingFace model

        Returns
        -------
        model :  model
            HuggingFace model
        """
        model = load_model(model_path)
        return model

    def sentences_encoder(self, sentences):
        """
        Tokenize sentences for HuggingFace transformer.
        Parameters
        ----------
        sentences : list
            List of sentences

        Returns
        -------
        ids : tensor
            tokens ids
        attention_masks : tensor
            tokens attention masks

        """
        max_length = self.model.input_shape[0][1]
        inputs = self.tokenizer(sentences, max_length=max_length, padding='max_length', truncation=True,
                                return_attention_mask=True, return_token_type_ids=False, return_tensors='tf')
        ids = inputs['input_ids']
        attention_masks = inputs['attention_mask']
        return ids, attention_masks

    def validate(self, sentences):
        """
        Calculate predictions for a set of sentences.

        Parameters
        ----------
        sentences : list
            List of sentences in remark

        Returns
        -------
        probabilities : list
            Probability of each sentence prediction
        """

        input_ids, input_attention_masks = self.sentences_encoder(sentences)

        probabilities = self.model.predict([input_ids, input_attention_masks])

        return probabilities
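
A minimal usage sketch; the model path and the tokenizer checkpoint name below are placeholders for illustration, not values taken from the project configuration:

from app.models import HuggingFaceValidator

# hypothetical model path and tokenizer checkpoint, for illustration only
validator = HuggingFaceValidator(model_path='models/fair_housing_hf',
                                 pretrained_model='distilbert-base-uncased')

sentences = ['Great family home near schools.', 'No children allowed.']
probabilities = validator.validate(sentences)  # one probability vector per sentence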

__init__(model_path, pretrained_model)

Init HuggingFace model and tokenizer

Parameters:

    model_path (str): path to the trained HuggingFace model. Required.
    pretrained_model (str): HuggingFace pretrained model for the tokenizer. Required.
Source code in app/models.py
def __init__(self, model_path, pretrained_model):
    """
    Init HuggingFace model and tokenizer
    Parameters
    ----------
    model_path : str
        path to the trained HuggingFace model
    pretrained_model : str
        HuggingFace pretrained model for tokenizer
    """
    self.model = self.load_huggingface_model(model_path=model_path)
    self.tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_model)

load_huggingface_model(model_path) staticmethod

Load HuggingFace model from path

Parameters:

    model_path (str): the path to the HuggingFace model. Required.

Returns:

    model (model): HuggingFace model.

Source code in app/models.py
@staticmethod
def load_huggingface_model(model_path):
    """
    Load HuggingFace model from path
    Parameters
    ----------
    model_path : str
        The path to HuggingFace model

    Returns
    -------
    model :  model
        HuggingFace model
    """
    model = load_model(model_path)
    return model

sentences_encoder(sentences)

Tokenize sentences for HuggingFace transformer.

Parameters:

    sentences (list): list of sentences. Required.

Returns:

    ids (tensor): token ids.
    attention_masks (tensor): token attention masks.

Source code in app/models.py
def sentences_encoder(self, sentences):
    """
    Tokenize sentences for HuggingFace transformer.
    Parameters
    ----------
    sentences : list
        List of sentences

    Returns
    -------
    ids : tensor
        tokens ids
    attention_masks : tensor
        tokens attention masks

    """
    max_length = self.model.input_shape[0][1]
    inputs = self.tokenizer(sentences, max_length=max_length, padding='max_length', truncation=True,
                            return_attention_mask=True, return_token_type_ids=False, return_tensors='tf')
    ids = inputs['input_ids']
    attention_masks = inputs['attention_mask']
    return ids, attention_masks
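
A short sketch of what the encoder returns, reusing the validator instance from the class-level example above:

ids, attention_masks = validator.sentences_encoder(['No children allowed.'])
# both are TensorFlow tensors of shape (batch_size, max_length), where
# max_length is read from the model's input shape
print(ids.shape)              # token ids
print(attention_masks.shape)  # 1 for real tokens, 0 for padding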

validate(sentences)

Calculate predictions for a set of sentences.

Parameters:

    sentences (list): list of sentences in the remark. Required.

Returns:

    probabilities (list): probability of each sentence prediction.

Source code in app/models.py
def validate(self, sentences):
    """
    Calculate predictions for a set of sentences.

    Parameters
    ----------
    sentences : list
        List of sentences in remark

    Returns
    -------
    probabilities : list
        Probability of each sentence prediction
    """

    input_ids, input_attention_masks = self.sentences_encoder(sentences)

    probabilities = self.model.predict([input_ids, input_attention_masks])

    return probabilities

Preprocessor

Preprocessor for cleaning input text and segmenting it into sentences.

Source code in app/models.py
class Preprocessor:
    """
    Preprocessor for cleaning input text and segmenting it into sentences.
    """

    def __init__(self):
        """
        Init the regex for HTML tags and the spaCy sentencizer
        """
        self.cleanr = re.compile(CLEANR)
        self.sentencizer = Sentencizer()

    def preprocess_text(self, text):
        """
        Preprocess raw text.
        Remove HTML tags, special symbols, and redundant whitespace.
        Parameters
        ----------
        text : str
            input text

        Returns
        -------
        clean_text : str
            text without html tags
        """

        # remove HTML tags, whitespaces, tabs and special symbols
        clean_text = re.sub(self.cleanr, '', text)
        clean_text = re.sub('\n|\t|\r', ' ', clean_text)
        clean_text = re.sub('[^A-Za-z0-9 ]+', '', clean_text)

        # collapse multiple whitespaces into a single one
        clean_text = re.sub(' +', ' ', clean_text)
        return clean_text

    def get_sentences(self, text, lowercase=False):
        """
        Extract sentences from text
        Parameters
        ----------
        text : str
            input text
        lowercase : bool
            lowercase sentences if true

        Returns
        -------
        sentences : list
            Initial sentences in list
        clean_sentences : list
            Preprocessed list of sentences
        locations : list
            Locations of the sentences in remark
        """
        remark = text
        matches = re.finditer(r'[!\.\*\?][a-zA-Z]', text)
        i = 0
        for match in matches:
            if match.end() < len(remark):
                if remark[match.end()] == ".":
                    continue
            if remark[match.start() - 2:match.end()] != "$#!T":
                text = text[:match.start() + i] + \
                       text[match.start() + i:match.end() + i].replace(match.group(), " ".join(match.group())) + \
                       text[match.end() + i:]

                i += 1
        doc = nlp(text)
        sentences = []
        locations = []
        clean_sentences = []

        for sentence in doc.sents:

            sentences.append(str(sentence))
            sent_start = remark.find(str(sentence))

            sent_end = sent_start + len(str(sentence))
            if lowercase:
                sent = str(sentence).lower()
            else:
                sent = str(sentence)
            clean_sentences.append(self.preprocess_text(sent))
            locations.append({"start": sent_start, "end": sent_end})

        return sentences, clean_sentences, locations
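
A minimal sketch of the preprocessing flow; it assumes the module-level spaCy pipeline and the CLEANR regex defined in app/models.py are available at import time:

from app.models import Preprocessor

preprocessor = Preprocessor()
text = '<b>Cozy cottage!</b>Close to downtown.\nNo pets.'

sentences, clean_sentences, locations = preprocessor.get_sentences(text, lowercase=True)
print(sentences)        # sentences as segmented by spaCy
print(clean_sentences)  # lowercased, with HTML tags and special symbols stripped
print(locations)        # [{'start': ..., 'end': ...}, ...] offsets in the raw remark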

__init__()

Init the regex for HTML tags and the spaCy sentencizer

Source code in app/models.py
def __init__(self):
    """
    Init the regex for HTML tags and the spaCy sentencizer
    """
    self.cleanr = re.compile(CLEANR)
    self.sentencizer = Sentencizer()

get_sentences(text, lowercase=False)

Extract sentences from text

Parameters:

    text (str): input text. Required.
    lowercase (bool): lowercase sentences if true. Default: False.

Returns:

    sentences (list): initial sentences as a list.
    clean_sentences (list): preprocessed list of sentences.
    locations (list): locations of the sentences in the remark.

Source code in app/models.py
def get_sentences(self, text, lowercase=False):
    """
    Extract sentences from text
    Parameters
    ----------
    text : str
        input text
    lowercase : bool
        lowercase sentences if true

    Returns
    -------
    sentences : list
        Initial sentences in list
    clean_sentences : list
        Preprocessed list of sentences
    locations : list
        Locations of the sentences in remark
    """
    remark = text
    matches = re.finditer(r'[!\.\*\?][a-zA-Z]', text)
    i = 0
    for match in matches:
        if match.end() < len(remark):
            if remark[match.end()] == ".":
                continue
        if remark[match.start() - 2:match.end()] != "$#!T":
            text = text[:match.start() + i] + \
                   text[match.start() + i:match.end() + i].replace(match.group(), " ".join(match.group())) + \
                   text[match.end() + i:]

            i += 1
    doc = nlp(text)
    sentences = []
    locations = []
    clean_sentences = []

    for sentence in doc.sents:

        sentences.append(str(sentence))
        sent_start = remark.find(str(sentence))

        sent_end = sent_start + len(str(sentence))
        if lowercase:
            sent = str(sentence).lower()
        else:
            sent = str(sentence)
        clean_sentences.append(self.preprocess_text(sent))
        locations.append({"start": sent_start, "end": sent_end})

    return sentences, clean_sentences, locations

preprocess_text(text)

Preprocess raw text: remove HTML tags, special symbols, and redundant whitespace.

Parameters:

    text (str): input text. Required.

Returns:

    clean_text (str): text without HTML tags.

Source code in app/models.py
def preprocess_text(self, text):
    """
    Preprocess raw text.
    Remove HTML tags, special symbols, and redundant whitespace.
    Parameters
    ----------
    text : str
        input text

    Returns
    -------
    clean_text : str
        text without html tags
    """

    # remove HTML tags, whitespaces, tabs and special symbols
    clean_text = re.sub(self.cleanr, '', text)
    clean_text = re.sub('\n|\t|\r', ' ', clean_text)
    clean_text = re.sub('[^A-Za-z0-9 ]+', '', clean_text)

    # collapse multiple whitespaces into a single one
    clean_text = re.sub(' +', ' ', clean_text)
    return clean_text

ProfanityValidator

Check for banned words

Source code in app/models.py
class ProfanityValidator:
    """
    Check for banned words
    """

    def __init__(self, censor_words_path=config.CENSORED_WORLDS_PATH):
        """
        Init the profanity library with a banned-words file.
        Parameters
        ----------
        censor_words_path : str
            path to the censored-words file.
        """
        # init the profanity checker only when a word-list path is provided
        if censor_words_path:
            self.profanity = Profanity(words=censor_words_path)

    @staticmethod
    def is_contain_numbers(word):
        """
        Check whether a word should be skipped because it contains digits.
        """
        # filter short words containing digits, e.g. B1
        if any(char.isdigit() for char in word) and len(word) < 3:
            return True
        # filter words made up entirely of digits
        elif all(char.isdigit() for char in word):
            return True
        else:
            return False

    def validate(self, sentences):
        """
        Check whether the sentences contain censored words.
        Parameters
        ----------
        sentences : list[str]
            input list of sentences

        Returns
        -------
        predictions : np.ndarray[int]
            list of binary predictions
        words : list[str]
            profanity words

        """

        predictions = np.zeros(len(sentences))
        words = []
        for i, sentence in enumerate(sentences):
            curse_words = self.profanity.get_curse_words(sentence)

            if len(curse_words):
                for cw in curse_words:
                    if not self.is_contain_numbers(cw):
                        words.append(cw)
                        predictions[i] = 0
            else:
                predictions[i] = 1
        return predictions, words
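
A minimal usage sketch; by default the word list is read from config.CENSORED_WORLDS_PATH, and the example sentence is invented:

from app.models import ProfanityValidator

profanity = ProfanityValidator()
predictions, words = profanity.validate(['This remark is perfectly clean.'])
print(predictions)  # 1.0 for a clean sentence, 0.0 when a censored word is found
print(words)        # the censored words that were found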

__init__(censor_words_path=config.CENSORED_WORLDS_PATH)

Init the profanity library with a banned-words file.

Parameters:

    censor_words_path (str): path to the censored-words file. Default: config.CENSORED_WORLDS_PATH.
Source code in app/models.py
def __init__(self, censor_words_path=config.CENSORED_WORLDS_PATH):
    """
    Init the profanity library with a banned-words file.
    Parameters
    ----------
    censor_words_path : str
        path to the censored-words file.
    """
    # init the profanity checker only when a word-list path is provided
    if censor_words_path:
        self.profanity = Profanity(words=censor_words_path)

validate(sentences)

Check whether the sentences contain censored words.

Parameters:

    sentences (list[str]): input list of sentences. Required.

Returns:

    predictions (np.ndarray[int]): array of binary predictions.
    words (list[str]): profanity words.

Source code in app/models.py
def validate(self, sentences):
    """
    Check whether the sentences contain censored words.
    Parameters
    ----------
    sentences : list[str]
        input list of sentences

    Returns
    -------
    predictions : np.ndarray[int]
        list of binary predictions
    words : list[str]
        profanity words

    """

    predictions = np.zeros(len(sentences))
    words = []
    for i, sentence in enumerate(sentences):
        curse_words = self.profanity.get_curse_words(sentence)

        if len(curse_words):
            for cw in curse_words:
                if not self.is_contain_numbers(cw):
                    words.append(cw)
                    predictions[i] = 0
        else:
            predictions[i] = 1
    return predictions, words

RuleBasedValidator

Validate text with the spaCy Matcher.

Source code in app/models.py
class RuleBasedValidator:
    """
    Validate text with the spaCy Matcher.
    """

    def __init__(self):
        """
        Init spacy Matcher
        """
        self.matcher = Matcher(nlp.vocab)
        self.construct_patterns()

    def construct_patterns(self):
        """
        Construct violation patterns.
        competitors patterns - case sensitive
        strict_violations - not case sensitive

        """
        # for urduc in SUPPORTED_URDUCS:
        #     urduc_data = violations.urduc_competitors
        #     competitors = urduc_data[urduc - 1].competitors
        #     self.matcher.add(f'competitors_{urduc}',
        #                      get_nested_pattern(violations=competitors),
        #                      )
        self.matcher.add('strict_violations',
                         get_nested_pattern(violations=violations.fair_housing_violations, case_sensitive=False),
                         )

    @staticmethod
    def collect_matches(matches, doc, patterns=None):
        """
        Collect matched tokens from matches
        Parameters
        ----------

        matches : Union[List[Tuple[int, int, int]], List[Span]]
            list of matches with match_id and indices
        doc : spacy doc object
            spacy doc object for sentence
        patterns : list[str]
            list of strings for collecting specific patterns

        Returns
        -------
        matched_phrases : list[str]
            matched phrases
        """
        if patterns is None:
            patterns = ['strict_violations', 'competitors']
        matched_phrases = []
        if isinstance(matches, list):
            for match_id, start, end in matches:
                string_id = doc.vocab.strings[match_id]
                if string_id in patterns:
                    phrase = doc[start:end].text
                    matched_phrases.append(phrase)
            return matched_phrases
        else:
            raise ValueError

    def validate(self, sentences, urduc):
        """
        Validate input sentences
        Parameters
        ----------
        urduc : int
            identifier for urduc
        sentences : list[str]
            input list of sentences

        Returns
        -------
        results : list
            list of binary predictions
        all_matched_phrases : list[str]
            matched violating phrases
        """
        results = []
        all_matched_phrases = []
        patterns_to_collect = ['strict_violations', f'competitors_{urduc}']
        for sent in sentences:
            doc = nlp(sent)
            matches = self.matcher(doc)
            if not len(matches):
                results.append(1)
            else:
                doc_matches = self.collect_matches(matches=matches, doc=doc, patterns=patterns_to_collect)
                all_matched_phrases.extend(doc_matches)
                results.append(0)
        return results, all_matched_phrases

    def check_competitors(self, sentences, urduc):
        """
        Validate input sentences
        Parameters
        ----------
        urduc : int
            identifier for urduc
        sentences : list[str]
            input list of sentences

        Returns
        -------
        predictions : list[int],
            list of binary predictions
        names : list[str]
            list of company names
        """
        predictions, names = [], []
        for sent in sentences:
            doc = nlp(sent)
            matches = self.matcher(doc)
            if not len(matches):
                predictions.append(0)
            else:
                patterns = [f'competitors_{urduc}']
                names_in_doc = self.collect_matches(matches=matches, doc=doc, patterns=patterns)
                predictions.append(1)
                names.extend(names_in_doc)

        return predictions, names

    @staticmethod
    def find_word_indices(sentence, word):
        """
        Find all occurrences of a word in the sentence with a generator.
        Parameters
        ----------
        sentence : str
            input sentence
        word : str
            word to search

        Returns
        -------
            python generator object
        """
        start = 0
        while True:
            start = sentence.find(word, start)
            if start == -1:
                return
            yield start
            start += len(word)

    def check_name_variation(self, sentence, variation, max_length):
        """
        Check whether the pattern exists in the given sentence.
        The function takes the pattern's tokens and checks whether an increasing sequence of their occurrences exists.

        Parameters
        ----------
        sentence : str
            input sentence
        variation : list[str]
            list of strings (competitor pattern)
        max_length : int
            number for filtering sequences where np.diff(seq)<max_length

        Returns
        -------
        indices : list[int]
            indices of the pattern
        """
        all_indices = []
        for i in range(len(variation)):
            word = variation[i]
            word_indices = list(self.find_word_indices(sentence=sentence, word=word))
            if len(word_indices) == 0:
                return []
            else:
                all_indices.append(word_indices)
        valid_subsequences = []
        start_word_indices = all_indices[0]
        for start_ind in start_word_indices:
            subseq = [start_ind]
            is_valid = False
            for i, word_indices in enumerate(all_indices[1:]):
                # once an increasing index was appended, search for the next one
                if len(subseq) >= i + 1:
                    # find occurrences of the next word after the current index
                    where = np.where(np.array(word_indices) > subseq[i])[0]
                    for w_ind in where:
                        if 0 < (word_indices[w_ind] - subseq[i]) < max_length:
                            subseq.append(word_indices[w_ind])
                            if len(subseq) == len(variation):
                                is_valid = True
                        if is_valid:
                            break
                    if is_valid:
                        break
            valid_subsequences.append(subseq)

        for subseq in valid_subsequences:
            if len(subseq) == len(variation):
                return subseq
        return []

    def validate_competitors(self, sentences, urduc=2):
        """
        Validate competitors
        Parameters
        ----------
        sentences: list[str]
            input sentences
        urduc: int
            enum for urduc

        Returns
        -------
        predictions : list[int],
            predictions for each sentence
        names : list[str]
            matched competitor names
        """

        competitors = violations.urduc_competitors[urduc - 1].competitors
        competitors = {comp.name: (split_list_of_strings(comp.variations), comp.max_length) for comp in competitors}
        predictions = []
        names = []

        for sent in sentences:
            pred_per_sentence = []
            names_per_sentence = []
            for competitor_name, (variations, max_length) in competitors.items():
                for variation in variations:
                    variation_indices = self.check_name_variation(sentence=sent,
                                                                  variation=variation,
                                                                  max_length=max_length)
                    if len(variation_indices):
                        pred_per_sentence.append(0)
                        names_per_sentence.append(competitor_name)
                    else:
                        pred_per_sentence.append(1)
            predictions.append(min(pred_per_sentence))
            names.extend(names_per_sentence)

        return predictions, names
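
A minimal usage sketch; the example sentence and the urduc value are placeholders, and the actual matches depend on the violations configuration loaded by app/models.py:

from app.models import RuleBasedValidator

rule_validator = RuleBasedValidator()

# strict fair-housing phrases (case-insensitive patterns)
results, phrases = rule_validator.validate(sentences=['example remark text'], urduc=2)
print(results)  # 1 = clean sentence, 0 = violation found
print(phrases)  # the matched violating phrases

# competitor-name variations (token-sequence search)
predictions, names = rule_validator.validate_competitors(sentences=['example remark text'], urduc=2)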

__init__()

Init spacy Matcher

Source code in app/models.py
def __init__(self):
    """
    Init spacy Matcher
    """
    self.matcher = Matcher(nlp.vocab)
    self.construct_patterns()

check_competitors(sentences, urduc)

Validate input sentences

Parameters:

    urduc (int): identifier for urduc. Required.
    sentences (list[str]): input list of sentences. Required.

Returns:

    predictions (list[int]): list of binary predictions.
    names (list[str]): list of company names.

Source code in app/models.py
def check_competitors(self, sentences, urduc):
    """
    Validate input sentences
    Parameters
    ----------
    urduc : int
        identifier for urduc
    sentences : list[str]
        input list of sentences

    Returns
    -------
    predictions : list[int],
        list of binary predictions
    names : list[str]
        list of company names
    """
    predictions, names = [], []
    for sent in sentences:
        doc = nlp(sent)
        matches = self.matcher(doc)
        if not len(matches):
            predictions.append(0)
        else:
            patterns = [f'competitors_{urduc}']
            names_in_doc = self.collect_matches(matches=matches, doc=doc, patterns=patterns)
            predictions.append(1)
            names.extend(names_in_doc)

    return predictions, names

check_name_variation(sentence, variation, max_length)

Check whether the pattern exists in the given sentence. The function takes the pattern's tokens and checks whether an increasing sequence of their occurrences exists.

Parameters:

    sentence (str): input sentence. Required.
    variation (list[str]): list of strings (competitor pattern). Required.
    max_length (int): number for filtering sequences where np.diff(seq) < max_length. Required.

Returns:

    indices (list[int]): indices of the pattern.

Source code in app/models.py
def check_name_variation(self, sentence, variation, max_length):
    """
    Check whether the pattern exists in the given sentence.
    The function takes the pattern's tokens and checks whether an increasing sequence of their occurrences exists.

    Parameters
    ----------
    sentence : str
        input sentence
    variation : list[str]
        list of strings (competitor pattern)
    max_length : int
        number for filtering sequences where np.diff(seq)<max_length

    Returns
    -------
    indices : list[int]
        indices of the pattern
    """
    all_indices = []
    for i in range(len(variation)):
        word = variation[i]
        word_indices = list(self.find_word_indices(sentence=sentence, word=word))
        if len(word_indices) == 0:
            return []
        else:
            all_indices.append(word_indices)
    valid_subsequences = []
    start_word_indices = all_indices[0]
    for start_ind in start_word_indices:
        subseq = [start_ind]
        is_valid = False
        for i, word_indices in enumerate(all_indices[1:]):
            # once an increasing index was appended, search for the next one
            if len(subseq) >= i + 1:
                # find occurrences of the next word after the current index
                where = np.where(np.array(word_indices) > subseq[i])[0]
                for w_ind in where:
                    if 0 < (word_indices[w_ind] - subseq[i]) < max_length:
                        subseq.append(word_indices[w_ind])
                        if len(subseq) == len(variation):
                            is_valid = True
                    if is_valid:
                        break
                if is_valid:
                    break
        valid_subsequences.append(subseq)

    for subseq in valid_subsequences:
        if len(subseq) == len(variation):
            return subseq
    return []
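
A worked illustration of the index search, reusing the rule_validator instance from the class-level sketch; the variation and max_length are invented for the example:

# 'zil' occurs at index 4 and 'low' at index 8 in the sentence below;
# since 0 < 8 - 4 < 10, the increasing subsequence [4, 8] covers the
# whole variation and is returned.
indices = rule_validator.check_name_variation(sentence='see zil low dot com',
                                              variation=['zil', 'low'],
                                              max_length=10)
print(indices)  # [4, 8]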

collect_matches(matches, doc, patterns=None) staticmethod

Collect matched tokens from matches

Parameters:

    matches (Union[List[Tuple[int, int, int]], List[Span]]): list of matches with match_id and indices. Required.
    doc (spacy Doc): spaCy doc object for the sentence. Required.
    patterns (list[str]): list of strings for collecting specific patterns. Default: None.

Returns:

    matched_phrases (list[str]): matched phrases.

Source code in app/models.py
@staticmethod
def collect_matches(matches, doc, patterns=None):
    """
    Collect matched tokens from matches
    Parameters
    ----------

    matches : Union[List[Tuple[int, int, int]], List[Span]]
        list of matches with match_id and indices
    doc : spacy doc object
        spacy doc object for sentence
    patterns : list[str]
        list of strings for collecting specific patterns

    Returns
    -------
    matched_phrases : list[str]
        matched phrases
    """
    if patterns is None:
        patterns = ['strict_violations', 'competitors']
    matched_phrases = []
    if isinstance(matches, list):
        for match_id, start, end in matches:
            string_id = doc.vocab.strings[match_id]
            if string_id in patterns:
                phrase = doc[start:end].text
                matched_phrases.append(phrase)
        return matched_phrases
    else:
        raise ValueError

construct_patterns()

Construct violation patterns. Competitor patterns are case-sensitive; strict_violations patterns are case-insensitive.

Source code in app/models.py
def construct_patterns(self):
    """
    Construct violation patterns.
    competitors patterns - case sensitive
    strict_violations - not case sensitive

    """
    # for urduc in SUPPORTED_URDUCS:
    #     urduc_data = violations.urduc_competitors
    #     competitors = urduc_data[urduc - 1].competitors
    #     self.matcher.add(f'competitors_{urduc}',
    #                      get_nested_pattern(violations=competitors),
    #                      )
    self.matcher.add('strict_violations',
                     get_nested_pattern(violations=violations.fair_housing_violations, case_sensitive=False),
                     )

find_word_indices(sentence, word) staticmethod

Find all occurrences of a word in the sentence with a generator.

Parameters:

    sentence (str): input sentence. Required.
    word (str): word to search for. Required.

Returns:

    A Python generator object yielding start indices.
Source code in app/models.py
@staticmethod
def find_word_indices(sentence, word):
    """
    Find all occurrences of a word in the sentence with a generator.
    Parameters
    ----------
    sentence : str
        input sentence
    word : str
        word to search

    Returns
    -------
        python generator object
    """
    start = 0
    while True:
        start = sentence.find(word, start)
        if start == -1:
            return
        yield start
        start += len(word)
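
A short example of the generator; since find_word_indices is a staticmethod it can be called on the class directly:

from app.models import RuleBasedValidator

offsets = list(RuleBasedValidator.find_word_indices('cat catalog cat', 'cat'))
print(offsets)  # [0, 4, 12]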

validate(sentences, urduc)

Validate input sentences

Parameters:

    urduc (int): identifier for urduc. Required.
    sentences (list[str]): input list of sentences. Required.

Returns:

    results (list): list of binary predictions.
    all_matched_phrases (list[str]): matched violating phrases.

Source code in app/models.py
def validate(self, sentences, urduc):
    """
    Validate input sentences
    Parameters
    ----------
    urduc : int
        identifier for urduc
    sentences : list[str]
        input list of sentences

    Returns
    -------
    results : list
        list of binary predictions
    all_matched_phrases : list[str]
        matched violating phrases
    """
    results = []
    all_matched_phrases = []
    patterns_to_collect = ['strict_violations', f'competitors_{urduc}']
    for sent in sentences:
        doc = nlp(sent)
        matches = self.matcher(doc)
        if not len(matches):
            results.append(1)
        else:
            doc_matches = self.collect_matches(matches=matches, doc=doc, patterns=patterns_to_collect)
            all_matched_phrases.extend(doc_matches)
            results.append(0)
    return results, all_matched_phrases

validate_competitors(sentences, urduc=2)

Validate competitors

Parameters:

    sentences (list[str]): input sentences. Required.
    urduc (int): enum for urduc. Default: 2.

Returns:

    predictions (list[int]): predictions for each sentence.
    names (list[str]): matched competitor names.

Source code in app/models.py
def validate_competitors(self, sentences, urduc=2):
    """
    Validate competitors
    Parameters
    ----------
    sentences: list[str]
        input sentences
    urduc: int
        enum for urduc

    Returns
    -------
    predictions : list[int],
        predictions for each sentence
    names : list[str]
        matched competitor names
    """

    competitors = violations.urduc_competitors[urduc - 1].competitors
    competitors = {comp.name: (split_list_of_strings(comp.variations), comp.max_length) for comp in competitors}
    predictions = []
    names = []

    for sent in sentences:
        pred_per_sentence = []
        names_per_sentence = []
        for competitor_name, (variations, max_length) in competitors.items():
            for variation in variations:
                variation_indices = self.check_name_variation(sentence=sent,
                                                              variation=variation,
                                                              max_length=max_length)
                if len(variation_indices):
                    pred_per_sentence.append(0)
                    names_per_sentence.append(competitor_name)
                else:
                    pred_per_sentence.append(1)
        predictions.append(min(pred_per_sentence))
        names.extend(names_per_sentence)

    return predictions, names

SpacyValidator

Check fair housing violations with a spaCy text-classification model.

Source code in app/models.py
class SpacyValidator:
    """
    Check fair housing violations with a spaCy text-classification model.
    """
    def __init__(self, model_path):
        self.model = self.load_spacy_model(model_path=model_path)

    @staticmethod
    def load_spacy_model(model_path):
        """
        Load spacy model from path
        Parameters
        ----------
        model_path : str
            The path of spacy model

        Returns
        -------
        model : spacy model
            model
        """
        model = spacy.load(model_path)
        return model

    def validate(self, sentences, threshold):
        """
        Calculate predictions for a set of sentences.

        Parameters
        ----------
        sentences : list
            List of sentences in remark
        threshold : float
            threshold of the violation prediction

        Returns
        -------
        bool_predictions : list
            Prediction of each sentence in boolean format
        probabilities : list
            Probability of each prediction
        """

        predictions = [i.cats for i in self.model.pipe(sentences)]
        bool_predictions = [i['POSITIVE'] < threshold for i in predictions]
        probabilities = [i['POSITIVE'] for i in predictions]
        return bool_predictions, probabilities
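
A minimal usage sketch; the model path is a placeholder, and the validator expects a trained spaCy text categorizer that exposes a 'POSITIVE' label:

from app.models import SpacyValidator

spacy_validator = SpacyValidator(model_path='models/spacy_textcat')  # hypothetical path
bool_predictions, probabilities = spacy_validator.validate(sentences=['example remark'],
                                                           threshold=0.5)
print(bool_predictions)  # True when the 'POSITIVE' score falls below the threshold
print(probabilities)     # raw 'POSITIVE' scores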

load_spacy_model(model_path) staticmethod

Load spacy model from path

Parameters:

    model_path (str): the path of the spaCy model. Required.

Returns:

    model (spacy model): the loaded model.

Source code in app/models.py
@staticmethod
def load_spacy_model(model_path):
    """
    Load spacy model from path
    Parameters
    ----------
    model_path : str
        The path of spacy model

    Returns
    -------
    model : spacy model
        model
    """
    model = spacy.load(model_path)
    return model

validate(sentences, threshold)

Calculate predictions for a set of sentences.

Parameters:

    sentences (list): list of sentences in the remark. Required.
    threshold (float): threshold of the violation prediction. Required.

Returns:

    bool_predictions (list): prediction for each sentence in boolean format.
    probabilities (list): probability of each prediction.

Source code in app/models.py
def validate(self, sentences, threshold):
    """
    Calculate predictions for a set of sentences.

    Parameters
    ----------
    sentences : list
        List of sentences in remark
    threshold : float
        threshold of the violation prediction

    Returns
    -------
    bool_predictions : list
        Prediction of each sentence in boolean format
    probabilities : list
        Probability of each prediction
    """

    predictions = [i.cats for i in self.model.pipe(sentences)]
    bool_predictions = [i['POSITIVE'] < threshold for i in predictions]
    probabilities = [i['POSITIVE'] for i in predictions]
    return bool_predictions, probabilities

Validator

Class that combines the spaCy model with the strict-violation checks.

Source code in app/models.py
class Validator:
    """
    Class that combines the spaCy model with the strict-violation checks.
    """

    def __init__(self, model_path=config.SPACY_MODEL_PATH, censored_words_path=config.CENSORED_WORLDS_PATH):
        # Load the spacy model
        # self.spacy_validator = SpacyValidator(model_path=model_path)

        # init profanity checker
        self.profanity_model = ProfanityValidator(censor_words_path=censored_words_path)

        # init strict violations matcher
        self.rule_based_validator = RuleBasedValidator()

    def validate(self, sentences, clean_sentences, urduc, threshold=config.PREDICTION_THRESHOLD):
        """
        Validate sentences with the RuleBasedValidator and the ProfanityValidator (the SpacyValidator pass is currently commented out).
        Parameters
        ----------
        urduc: int
            identifier for urduc
        sentences: list[str]
            list of input sentences
        clean_sentences: list[str]
            list of preprocessed sentences
        threshold: float
            threshold for calculating predictions

        Returns
        -------
        final_predictions : list[bool]
            1 if sentence is valid
        final_probabilities : list[float]
            the probability of validation
        violations : set[str]
            set of violations
        """

        # pass through ViolationsMatcher
        predictions_rb, violations_rb = self.rule_based_validator.validate_competitors(sentences=clean_sentences,
                                                                                       urduc=urduc)
        probabilities_rb = [1 - pred for pred in predictions_rb]

        # check strict violations
        predictions_strict, violations_strict = self.rule_based_validator.validate(sentences=sentences, urduc=urduc)
        probabilities_strict = [1 - pred for pred in predictions_strict]

        # pass through library for profanity
        predictions_pf, violations_pf = self.profanity_model.validate(sentences=sentences)
        probabilities_pf = [1 - pred for pred in predictions_pf]

        # -------------- commented only for resolving conflict with spacy transformer -----------
        # predict with spacy model
        # model_predictions, probabilities = self.spacy_validator.validate(sentences=clean_sentences, threshold=threshold)

        # combine the results for output
        # final_predictions = np.array([predictions_rb, predictions_pf, predictions_strict, model_predictions]).min(
        #     axis=0)
        # final_probabilities = np.array([probabilities_rb, probabilities_pf, probabilities_strict, probabilities]).max(
        #     axis=0)
        # -------------- commented only for resolving conflict with spacy transformer -----------

        final_predictions = np.array([predictions_rb, predictions_pf, predictions_strict]).min(
            axis=0)
        final_probabilities = np.array([probabilities_rb, probabilities_pf, probabilities_strict]).max(
            axis=0)

        violations = set(violations_rb + violations_pf + violations_strict)

        return final_predictions, final_probabilities, violations

    def validate_competitors_and_profanity(self, preprocessed_sentences, text, urduc):
        """
        Validate sentences with the RuleBasedValidator and the ProfanityValidator.
        Parameters
        ----------
        preprocessed_sentences : list[str]
            list of preprocessed sentences
        urduc : int
            identifier for urduc
        text : str
            input text (not preprocessed)

        Returns
        -------
        pred : int
            0 for violation, 1 otherwise
        violations : set[str]
            set of violations


        """
        # pass through ViolationsMatcher
        predictions_rb, violations_rb = self.rule_based_validator.validate_competitors(sentences=preprocessed_sentences,
                                                                                       urduc=urduc)

        # check strict violations
        predictions_strict, violations_strict = self.rule_based_validator.validate(sentences=[text],
                                                                                   urduc=urduc)

        # pass through library for profanity
        predictions_pf, violations_pf = self.profanity_model.validate(sentences=[text])

        # combine violation tokens
        violations = set(violations_rb + violations_pf + violations_strict)
        # 0 for violation
        pred = np.min([np.min(predictions_strict), np.min(predictions_pf), np.min(predictions_rb)])

        return pred, violations
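
An end-to-end sketch combining Preprocessor and Validator; the remark text and the urduc value are invented, and the default paths come from the config module:

from app.models import Preprocessor, Validator

preprocessor = Preprocessor()
validator = Validator()  # default word-list and model paths from config

text = 'Charming home close to everything. Example remark text.'
sentences, clean_sentences, _ = preprocessor.get_sentences(text, lowercase=True)

predictions, probabilities, violations = validator.validate(sentences=sentences,
                                                            clean_sentences=clean_sentences,
                                                            urduc=2)
print(predictions)  # per sentence: 1 = valid, 0 = violation
print(violations)   # set of matched violating phrases and words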

validate(sentences, clean_sentences, urduc, threshold=config.PREDICTION_THRESHOLD)

Validate sentences with the RuleBasedValidator and the ProfanityValidator (the SpacyValidator pass is currently commented out).

Parameters:

    urduc (int): identifier for urduc. Required.
    sentences (list[str]): list of input sentences. Required.
    clean_sentences (list[str]): list of preprocessed sentences. Required.
    threshold (float): threshold for calculating predictions. Default: config.PREDICTION_THRESHOLD.

Returns:

    final_predictions (list[bool]): 1 if the sentence is valid.
    final_probabilities (list[float]): the probability of validation.
    violations (set[str]): set of violations.

Source code in app/models.py
def validate(self, sentences, clean_sentences, urduc, threshold=config.PREDICTION_THRESHOLD):
    """
    Validate sentences with the RuleBasedValidator and the ProfanityValidator (the SpacyValidator pass is currently commented out).
    Parameters
    ----------
    urduc: int
        identifier for urduc
    sentences: list[str]
        list of input sentences
    clean_sentences: list[str]
        list of preprocessed sentences
    threshold: float
        threshold for calculating predictions

    Returns
    -------
    final_predictions : list[bool]
        1 if sentence is valid
    final_probabilities : list[float]
        the probability of validation
    violations : set[str]
        set of violations
    """

    # pass through ViolationsMatcher
    predictions_rb, violations_rb = self.rule_based_validator.validate_competitors(sentences=clean_sentences,
                                                                                   urduc=urduc)
    probabilities_rb = [1 - pred for pred in predictions_rb]

    # check strict violations
    predictions_strict, violations_strict = self.rule_based_validator.validate(sentences=sentences, urduc=urduc)
    probabilities_strict = [1 - pred for pred in predictions_strict]

    # pass through library for profanity
    predictions_pf, violations_pf = self.profanity_model.validate(sentences=sentences)
    probabilities_pf = [1 - pred for pred in predictions_pf]

    # -------------- commented only for resolving conflict with spacy transformer -----------
    # predict with spacy model
    # model_predictions, probabilities = self.spacy_validator.validate(sentences=clean_sentences, threshold=threshold)

    # combine the results for output
    # final_predictions = np.array([predictions_rb, predictions_pf, predictions_strict, model_predictions]).min(
    #     axis=0)
    # final_probabilities = np.array([probabilities_rb, probabilities_pf, probabilities_strict, probabilities]).max(
    #     axis=0)
    # -------------- commented only for resolving conflict with spacy transformer -----------

    final_predictions = np.array([predictions_rb, predictions_pf, predictions_strict]).min(
        axis=0)
    final_probabilities = np.array([probabilities_rb, probabilities_pf, probabilities_strict]).max(
        axis=0)

    violations = set(violations_rb + violations_pf + violations_strict)

    return final_predictions, final_probabilities, violations

validate_competitors_and_profanity(preprocessed_sentences, text, urduc)

Validate sentences with the RuleBasedValidator and the ProfanityValidator.

Parameters:

    preprocessed_sentences (list[str]): list of preprocessed sentences. Required.
    urduc (int): identifier for urduc. Required.
    text (str): input text (not preprocessed). Required.

Returns:

    pred (int): 0 for violation, 1 otherwise.
    violations (set[str]): set of violations.

Source code in app/models.py
def validate_competitors_and_profanity(self, preprocessed_sentences, text, urduc):
    """
    Validate sentences with the RuleBasedValidator and the ProfanityValidator.
    Parameters
    ----------
    preprocessed_sentences : list[str]
        list of preprocessed sentences
    urduc : int
        identifier for urduc
    text : str
        input text (not preprocessed)

    Returns
    -------
    pred : int
        0 for violation, 1 otherwise
    violations : set[str]
        set of violations


    """
    # pass through ViolationsMatcher
    predictions_rb, violations_rb = self.rule_based_validator.validate_competitors(sentences=preprocessed_sentences,
                                                                                   urduc=urduc)

    # check strict violations
    predictions_strict, violations_strict = self.rule_based_validator.validate(sentences=[text],
                                                                               urduc=urduc)

    # pass through library for profanity
    predictions_pf, violations_pf = self.profanity_model.validate(sentences=[text])

    # combine violation tokens
    violations = set(violations_rb + violations_pf + violations_strict)
    # 0 for violation
    pred = np.min([np.min(predictions_strict), np.min(predictions_pf), np.min(predictions_rb)])

    return pred, violations

get_nested_pattern(violations, case_sensitive=True)

Construct spaCy patterns from a list of strings. The case_sensitive flag controls whether the patterns are case-sensitive.

Parameters:

    violations (list[str]): input list of strings. Required.
    case_sensitive (bool): True for case-sensitive patterns. Default: True.

Returns:

    pattern (list[list[dict[str, any]]]): constructed spaCy pattern.

Source code in app/models.py
def get_nested_pattern(violations, case_sensitive=True):
    """
    Construct spaCy patterns from the list of strings.
    The case_sensitive flag controls whether the patterns are case-sensitive.
    Parameters
    ----------
    violations : list[str]
        input list of strings
    case_sensitive : bool
        True for case-sensitive patterns.

    Returns
    -------
    pattern : list[list[dict[str, any]]]
        constructed spacy pattern

    """
    violations_split = split_list_of_strings(strings=violations, tokenize=True)
    pattern = []
    for violation in violations_split:
        viol_pattern = []
        for item in violation:
            if case_sensitive:
                item_pattern = {'TEXT': item}
            else:
                item_pattern = {'LOWER': item.text.lower()}
            viol_pattern.append(item_pattern)
        pattern.append(viol_pattern)
    return pattern
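
A small example of the constructed pattern; the phrase is illustrative. With case_sensitive=False every token becomes a {'LOWER': ...} constraint:

pattern = get_nested_pattern(violations=['No Kids'], case_sensitive=False)
print(pattern)  # [[{'LOWER': 'no'}, {'LOWER': 'kids'}]]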

split_list_of_strings(strings, tokenize=False)

Split list of strings to tokens

Parameters:

    tokenize (bool): tokenize with spaCy or split on spaces. Default: False.
    strings (list[str]): input list of strings. Required.

Returns:

    result (list[list[str]]): nested list with split items.

Source code in app/models.py
def split_list_of_strings(strings, tokenize=False):
    """
    Split list of strings to tokens
    Parameters
    ----------
    tokenize : bool
        tokenize with Spacy or split with spaces
    strings : list[str]
        input list of strings

    Returns
    -------
    result : list[list[str]]
        nested list with split items
    """
    if not tokenize:
        return [s.split(' ') for s in strings]
    else:
        return [[x for x in nlp(s)] for s in strings]
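
A small example of both modes; with tokenize=True the items are spaCy Token objects rather than plain strings:

print(split_list_of_strings(['no kids', 'adults only']))
# [['no', 'kids'], ['adults', 'only']]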