Skip to content

Commit

Permalink
Implemented toxicity filter
Browse files Browse the repository at this point in the history
  • Loading branch information
saqibns committed Jul 11, 2021
1 parent 53227f8 commit 08291c7
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 0 deletions.
21 changes: 21 additions & 0 deletions filters/toxicity/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
## Toxicity filter

## What type of filter is this?

This filter selects an example text whose score for a chosen toxicity label satisfies a comparison (`>`, `<`, `>=`, `<=` or `==`) against a given threshold in the range [0, 1].
Author: Saqib N. Shamsi

Toxicity labels supported:
* ToxicityTypes.TOXICITY
* ToxicityTypes.SEVERE_TOXICITY
* ToxicityTypes.OBSCENE
* ToxicityTypes.IDENTITY_ATTACK
* ToxicityTypes.INSULT
* ToxicityTypes.THREAT
* ToxicityTypes.SEXUAL_EXPLICIT

## Why is this filter important?
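Language generation models that have been trained on data containing hate speech can learn to produce samples containing hate speech. Filtering on a toxicity score makes it possible to identify such examples in a dataset, for instance to remove them or to examine them separately.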

## What are the limitations of this filter?
Since this filter uses a model which was trained for toxicity detection, the quality of the data that model was trained on and the model's performance affect the accuracy with which toxic text is detected and filtered.
1 change: 1 addition & 0 deletions filters/toxicity/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .filter import *
60 changes: 60 additions & 0 deletions filters/toxicity/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import enum
import operator

from detoxify import Detoxify

from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType


class ToxicityTypes(str, enum.Enum):
    """Toxicity labels that a ``ToxicityFilter`` can be configured with.

    The ``str`` mix-in makes every member compare and hash like its plain
    string value, so a member and its value are interchangeable as
    dictionary keys.
    """

    # General toxicity labels.
    TOXICITY = "toxicity"
    SEVERE_TOXICITY = "severe_toxicity"

    # More specific labels.
    OBSCENE = "obscene"
    IDENTITY_ATTACK = "identity_attack"
    INSULT = "insult"
    THREAT = "threat"
    SEXUAL_EXPLICIT = "sexual_explicit"


class ToxicityFilter(SentenceOperation):
    """Filter sentences by comparing one toxicity score against a threshold.

    The score for the configured toxicity label is read from the
    predictions of a ``Detoxify("unbiased")`` model and compared with
    ``threshold`` using the configured comparison operator.

    Args:
        toxicity_type: The toxicity label to test; a ``ToxicityTypes``
            member or its string value (e.g. ``"insult"``).
        op: Comparison applied as ``score <op> threshold``. One of
            ``">"``, ``"<"``, ``">="``, ``"<="``, ``"=="``.
        threshold: Value in the closed range [0, 1] to compare against.

    Raises:
        ValueError: If ``threshold`` is outside [0, 1] (NaN included) or
            ``toxicity_type`` is not a known toxicity label.
        KeyError: If ``op`` is not one of the supported operators.
    """

    tasks = [TaskType.TEXT_TO_TEXT_GENERATION]
    languages = ["en"]

    def __init__(
        self,
        toxicity_type: ToxicityTypes,
        op: str = ">",
        threshold: float = 0.5,
    ):
        super().__init__()

        self.check_threshold_value(threshold)

        # Normalise through the enum so that an unknown label is rejected
        # here with a ValueError, instead of surfacing as a KeyError on the
        # first call to filter(). Members hash and compare like their string
        # value, so the stored value still works as a key for plain strings.
        self.type = ToxicityTypes(toxicity_type)
        self.operator = self.parse_operator(op)
        self.threshold = threshold
        # Created last so that invalid arguments fail before the model is
        # instantiated.
        self.unbiased_model = Detoxify("unbiased")

    @staticmethod
    def parse_operator(op):
        """Return the ``operator`` function for a comparison symbol.

        Args:
            op: One of ``">"``, ``"<"``, ``">="``, ``"<="``, ``"=="``.

        Returns:
            The matching two-argument function from the ``operator`` module.

        Raises:
            KeyError: If ``op`` is not a supported symbol.
        """
        ops = {
            ">": operator.gt,
            "<": operator.lt,
            ">=": operator.ge,
            "<=": operator.le,
            "==": operator.eq,
        }
        try:
            return ops[op]
        except KeyError:
            # Same exception type as a bare lookup, but with a message that
            # tells the caller which operators are accepted.
            raise KeyError(
                f"Unsupported operator {op!r}. Expected one of {sorted(ops)}."
            ) from None

    @staticmethod
    def check_threshold_value(threshold):
        """Validate that ``threshold`` lies in the closed range [0, 1].

        Raises:
            ValueError: If ``threshold`` is below 0, above 1, or NaN.
        """
        # A chained comparison is False for NaN, so NaN is rejected as well;
        # `threshold < 0 or threshold > 1` would let it through.
        if not 0 <= threshold <= 1:
            raise ValueError(
                f"Threshold must be in the range [0, 1]. {threshold} provided."
            )

    def filter(self, sentence: str = None) -> bool:
        """Return whether ``sentence`` satisfies the configured comparison.

        Args:
            sentence: Text to score with the toxicity model.

        Returns:
            ``True`` if ``score <op> threshold`` holds for the configured
            toxicity label, ``False`` otherwise.
        """
        predictions = self.unbiased_model.predict(sentence)
        toxicity_value = predictions[self.type]
        # NOTE(review): the score is presumably a NumPy scalar, in which case
        # the comparison yields a NumPy boolean rather than a built-in bool;
        # confirm against Detoxify. Coercing keeps the annotated return type
        # true either way.
        return bool(self.operator(toxicity_value, self.threshold))
77 changes: 77 additions & 0 deletions filters/toxicity/test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{
"type": "toxicity",
"test_cases": [
{
"class": "ToxicityFilter",
"args": {
"toxicity_type": "insult",
"op": ">=",
"threshold": 0.8
},
"inputs": {
"sentence": "What a moron!"
},
"outputs": true
},
{
"class": "ToxicityFilter",
"args": {
"toxicity_type": "toxicity",
"op": ">=",
"threshold": 0.9
},
"inputs": {
"sentence": "I disagree. It is not supposed to work that way."
},
"outputs": false
},
{
"class": "ToxicityFilter",
"args": {
"toxicity_type": "toxicity",
"op": ">",
"threshold": 0.8
},
"inputs": {
"sentence": "Shut up, faggot!"
},
"outputs": true
},
{
"class": "ToxicityFilter",
"args": {
"toxicity_type": "insult",
"op": "<",
"threshold": 0.75
},
"inputs": {
"sentence": "I am going to come right there and beat you into a pulp!"
},
"outputs": true
},
{
"class": "ToxicityFilter",
"args": {
"toxicity_type": "threat",
"op": "<=",
"threshold": 0.75
},
"inputs": {
"sentence": "I am going to come right there and beat you into a pulp!"
},
"outputs": false
},
{
"class": "ToxicityFilter",
"args": {
"toxicity_type": "toxicity",
"op": "<=",
"threshold": 0.5
},
"inputs": {
"sentence": "You are really clever"
},
"outputs": true
}
]
}
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,6 @@ black==21.5b2
pre-commit==2.13.0
flake8==3.9.2
isort==5.9.1

# for toxicity detection
detoxify==0.2.2

0 comments on commit 08291c7

Please sign in to comment.