2019-07-30 15:58:01 +03:00
|
|
|
import spacy
|
2020-07-06 14:06:25 +03:00
|
|
|
from spacy.util import minibatch
|
|
|
|
from thinc.api import compounding
|
2020-07-06 14:02:36 +03:00
|
|
|
from spacy.gold import Example
|
2019-07-30 15:58:01 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_issue4030():
    """Check that a trained textcat pipeline copes with an empty doc.

    A small bag-of-words text categorizer is trained for three passes on
    three labelled texts. The pipeline is then run on an empty string and
    every category score is expected to be exactly 0.0.
    """
    labels = ["offensive", "inoffensive"]
    texts = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    gold_labels = ["offensive", "offensive", "inoffensive"]

    nlp = spacy.blank("en")

    # Prepare the data: one Example per text, whose "cats" dict maps every
    # known label to True only when it equals that text's gold label.
    train_data = [
        Example.from_dict(
            nlp.make_doc(text),
            {"cats": {label: label == gold for label in labels}},
        )
        for text, gold in zip(texts, gold_labels)
    ]

    # Add a text categorizer component carrying both labels.
    textcat_config = {
        "exclusive_classes": True,
        "architecture": "bow",
        "ngram_size": 2,
    }
    textcat = nlp.create_pipe("textcat", config=textcat_config)
    for label in labels:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)

    # Train the network with only the textcat pipe enabled.
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for _ in range(3):
            # Fresh loss accumulator and batch-size schedule for each pass.
            losses = {}
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in minibatch(train_data, size=batch_sizes):
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses
                )

    # Processing of an empty doc should result in 0.0 for all categories.
    doc = nlp("")
    for label in labels:
        assert doc.cats[label] == 0.0
|