Merge branch 'develop' into spacy.io

This commit is contained in:
Ines Montani 2019-02-24 18:35:32 +01:00
commit 948ca2bb3e
20 changed files with 450 additions and 244 deletions

View File

@ -4,7 +4,7 @@ import random
import srsly import srsly
import spacy import spacy
from spacy.gold import GoldParse from spacy.gold import GoldParse
from spacy.util import minibatch from spacy.util import minibatch, compounding
LABEL = "ANIMAL" LABEL = "ANIMAL"
@ -54,9 +54,17 @@ def main(model_name, unlabelled_loc):
nlp.get_pipe("ner").add_label(LABEL) nlp.get_pipe("ner").add_label(LABEL)
raw_docs = list(read_raw_data(nlp, unlabelled_loc)) raw_docs = list(read_raw_data(nlp, unlabelled_loc))
optimizer = nlp.resume_training() optimizer = nlp.resume_training()
# Avoid use of Adam when resuming training. I don't understand this well
# yet, but I'm getting weird results from Adam. Try commenting out the
# nlp.update(), and using Adam -- you'll find the models drift apart.
# I guess Adam is losing precision, introducing gradient noise?
optimizer.alpha = 0.1
optimizer.b1 = 0.0
optimizer.b2 = 0.0
# get names of other pipes to disable them during training # get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
sizes = compounding(1.0, 4.0, 1.001)
with nlp.disable_pipes(*other_pipes): with nlp.disable_pipes(*other_pipes):
for itn in range(n_iter): for itn in range(n_iter):
random.shuffle(TRAIN_DATA) random.shuffle(TRAIN_DATA)
@ -64,13 +72,22 @@ def main(model_name, unlabelled_loc):
losses = {} losses = {}
r_losses = {} r_losses = {}
# batch up the examples using spaCy's minibatch # batch up the examples using spaCy's minibatch
raw_batches = minibatch(raw_docs, size=batch_size) raw_batches = minibatch(raw_docs, size=4)
for doc, gold in TRAIN_DATA: for batch in minibatch(TRAIN_DATA, size=sizes):
nlp.update([doc], [gold], sgd=optimizer, drop=dropout, losses=losses) docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
raw_batch = list(next(raw_batches)) raw_batch = list(next(raw_batches))
nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses) nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
print("Losses", losses) print("Losses", losses)
print("R. Losses", r_losses) print("R. Losses", r_losses)
print(nlp.get_pipe('ner').model.unseen_classes)
test_text = "Do you like horses?"
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
print(ent.label_, ent.text)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -45,19 +45,19 @@ LABEL = "ANIMAL"
TRAIN_DATA = [ TRAIN_DATA = [
( (
"Horses are too tall and they pretend to care about your feelings", "Horses are too tall and they pretend to care about your feelings",
{"entities": [(0, 6, "ANIMAL")]}, {"entities": [(0, 6, LABEL)]},
), ),
("Do they bite?", {"entities": []}), ("Do they bite?", {"entities": []}),
( (
"horses are too tall and they pretend to care about your feelings", "horses are too tall and they pretend to care about your feelings",
{"entities": [(0, 6, "ANIMAL")]}, {"entities": [(0, 6, LABEL)]},
), ),
("horses pretend to care about your feelings", {"entities": [(0, 6, "ANIMAL")]}), ("horses pretend to care about your feelings", {"entities": [(0, 6, LABEL)]}),
( (
"they pretend to care about your feelings, those horses", "they pretend to care about your feelings, those horses",
{"entities": [(48, 54, "ANIMAL")]}, {"entities": [(48, 54, LABEL)]},
), ),
("horses?", {"entities": [(0, 6, "ANIMAL")]}), ("horses?", {"entities": [(0, 6, LABEL)]}),
] ]
@ -67,8 +67,9 @@ TRAIN_DATA = [
output_dir=("Optional output directory", "option", "o", Path), output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int), n_iter=("Number of training iterations", "option", "n", int),
) )
def main(model=None, new_model_name="animal", output_dir=None, n_iter=10): def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
"""Set up the pipeline and entity recognizer, and train the new entity.""" """Set up the pipeline and entity recognizer, and train the new entity."""
random.seed(0)
if model is not None: if model is not None:
nlp = spacy.load(model) # load existing spaCy model nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model) print("Loaded model '%s'" % model)
@ -85,21 +86,22 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=10):
ner = nlp.get_pipe("ner") ner = nlp.get_pipe("ner")
ner.add_label(LABEL) # add new entity label to entity recognizer ner.add_label(LABEL) # add new entity label to entity recognizer
# Adding extraneous labels shouldn't mess anything up
ner.add_label('VEGETABLE')
if model is None: if model is None:
optimizer = nlp.begin_training() optimizer = nlp.begin_training()
else: else:
# Note that 'begin_training' initializes the models, so it'll zero out optimizer = nlp.resume_training()
# existing entity types. move_names = list(ner.move_names)
optimizer = nlp.entity.create_optimizer()
# get names of other pipes to disable them during training # get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes): # only train NER with nlp.disable_pipes(*other_pipes): # only train NER
sizes = compounding(1.0, 4.0, 1.001)
# batch up the examples using spaCy's minibatch
for itn in range(n_iter): for itn in range(n_iter):
random.shuffle(TRAIN_DATA) random.shuffle(TRAIN_DATA)
batches = minibatch(TRAIN_DATA, size=sizes)
losses = {} losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches: for batch in batches:
texts, annotations = zip(*batch) texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses) nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
@ -124,6 +126,8 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=10):
# test the saved model # test the saved model
print("Loading from", output_dir) print("Loading from", output_dir)
nlp2 = spacy.load(output_dir) nlp2 = spacy.load(output_dir)
# Check the classes have loaded back consistently
assert nlp2.get_pipe('ner').move_names == move_names
doc2 = nlp2(test_text) doc2 = nlp2(test_text)
for ent in doc2.ents: for ent in doc2.ents:
print(ent.label_, ent.text) print(ent.label_, ent.text)

View File

@ -571,8 +571,6 @@ def build_text_classifier(nr_class, width=64, **cfg):
zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
>> logistic >> logistic
) )
model = ( model = (
(linear_model | cnn_model) (linear_model | cnn_model)
>> output_layer >> output_layer

View File

@ -4,7 +4,7 @@
# fmt: off # fmt: off
__title__ = "spacy-nightly" __title__ = "spacy-nightly"
__version__ = "2.1.0a9.dev1" __version__ = "2.1.0a9.dev2"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io" __uri__ = "https://spacy.io"
__author__ = "Explosion AI" __author__ = "Explosion AI"

View File

@ -290,7 +290,8 @@ class Errors(object):
"NBOR_RELOP.") "NBOR_RELOP.")
E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have " E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
"have been declared in previous edges.") "have been declared in previous edges.")
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of tokens to merge") E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
"tokens to merge.")
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token" E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
" can only be part of one entity, so make sure the entities you're " " can only be part of one entity, so make sure the entities you're "
"setting don't overlap.") "setting don't overlap.")
@ -318,12 +319,12 @@ class Errors(object):
"So instead of pickling the span, pickle the Doc it belongs to or " "So instead of pickling the span, pickle the Doc it belongs to or "
"use Span.as_doc to convert the span to a standalone Doc object.") "use Span.as_doc to convert the span to a standalone Doc object.")
E113 = ("The newly split token can only have one root (head = 0).") E113 = ("The newly split token can only have one root (head = 0).")
E114 = ("The newly split token needs to have a root (head = 0)") E114 = ("The newly split token needs to have a root (head = 0).")
E115 = ("All subtokens must have associated heads") E115 = ("All subtokens must have associated heads.")
E116 = ("Cannot currently add labels to pre-trained text classifier. Add " E116 = ("Cannot currently add labels to pre-trained text classifier. Add "
"labels before training begins. This functionality was available " "labels before training begins. This functionality was available "
"in previous versions, but had significant bugs that led to poor " "in previous versions, but had significant bugs that led to poor "
"performance") "performance.")
E117 = ("The newly split tokens must match the text of the original token. " E117 = ("The newly split tokens must match the text of the original token. "
"New orths: {new}. Old text: {old}.") "New orths: {new}. Old text: {old}.")

View File

@ -24,52 +24,68 @@ _latin_l_supplement = r"\u00DF-\u00F6\u00F8-\u00FF"
_latin_supplement = r"\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF" _latin_supplement = r"\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF"
# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
_latin_u_extendedA = r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" \ _latin_u_extendedA = (
r"\u011E\u0120\u0122\u0124\u0126\u0128\u012A\u012C\u012E\u0130\u0132\u0134\u0136\u0139\u013B" \ r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
r"\u013D\u013F\u0141\u0143\u0145\u0147\u014A\u014C\u014E\u0150\u0152\u0154\u0156\u0158" \ r"\u011E\u0120\u0122\u0124\u0126\u0128\u012A\u012C\u012E\u0130\u0132\u0134\u0136\u0139\u013B"
r"\u015A\u015C\u015E\u0160\u0162\u0164\u0166\u0168\u016A\u016C\u016E\u0170\u0172\u0174\u0176" \ r"\u013D\u013F\u0141\u0143\u0145\u0147\u014A\u014C\u014E\u0150\u0152\u0154\u0156\u0158"
r"\u015A\u015C\u015E\u0160\u0162\u0164\u0166\u0168\u016A\u016C\u016E\u0170\u0172\u0174\u0176"
r"\u0178\u0179\u017B\u017D" r"\u0178\u0179\u017B\u017D"
_latin_l_extendedA = r"\u0101\u0103\u0105\u0107\u0109\u010B\u010D\u010F\u0111\u0113\u0115\u0117\u0119\u011B\u011D" \ )
r"\u011F\u0121\u0123\u0125\u0127\u0129\u012B\u012D\u012F\u0131\u0133\u0135\u0137\u0138\u013A" \ _latin_l_extendedA = (
r"\u013C\u013E\u0140\u0142\u0144\u0146\u0148\u0149\u014B\u014D\u014F\u0151\u0153\u0155\u0157" \ r"\u0101\u0103\u0105\u0107\u0109\u010B\u010D\u010F\u0111\u0113\u0115\u0117\u0119\u011B\u011D"
r"\u0159\u015B\u015D\u015F\u0161\u0163\u0165\u0167\u0169\u016B\u016D\u016F\u0171\u0173\u0175" \ r"\u011F\u0121\u0123\u0125\u0127\u0129\u012B\u012D\u012F\u0131\u0133\u0135\u0137\u0138\u013A"
r"\u013C\u013E\u0140\u0142\u0144\u0146\u0148\u0149\u014B\u014D\u014F\u0151\u0153\u0155\u0157"
r"\u0159\u015B\u015D\u015F\u0161\u0163\u0165\u0167\u0169\u016B\u016D\u016F\u0171\u0173\u0175"
r"\u0177\u017A\u017C\u017E\u017F" r"\u0177\u017A\u017C\u017E\u017F"
)
_latin_extendedA = r"\u0100-\u017F" _latin_extendedA = r"\u0100-\u017F"
# special characters - Khoisan, Pan-Nigerian, Pinyin, Romanian # special characters - Khoisan, Pan-Nigerian, Pinyin, Romanian
# those that are a combination of both upper and lower letters are only included in the group _latin_extendedB # those that are a combination of both upper and lower letters are only included in the group _latin_extendedB
_latin_u_extendedB = r"\u0181\u0182\u0184\u0186\u0187\u0189-\u018B\u018E-\u0191\u0193\u0194\u0196-\u0198\u019C" \ _latin_u_extendedB = (
r"\u019D\u019F\u01A0\u01A2\u01A4\u01A6\u01A7\u01A9\u01AC\u01AE\u01AF\u01B1-\u01B3\u01B5" \ r"\u0181\u0182\u0184\u0186\u0187\u0189-\u018B\u018E-\u0191\u0193\u0194\u0196-\u0198\u019C"
r"\u01B7\u01B8\u01BC\u01C4\u01C7\u01CA\u01CD\u01CF\u01D1\u01D3\u01D5\u01D7\u01D9\u01DB" \ r"\u019D\u019F\u01A0\u01A2\u01A4\u01A6\u01A7\u01A9\u01AC\u01AE\u01AF\u01B1-\u01B3\u01B5"
r"\u01DE\u01E0\u01E2\u01E4\u01E6\u01E8\u01EA\u01EC\u01EE\u01F1\u01F4\u01F6-\u01F8\u01FA" \ r"\u01B7\u01B8\u01BC\u01C4\u01C7\u01CA\u01CD\u01CF\u01D1\u01D3\u01D5\u01D7\u01D9\u01DB"
r"\u01FC\u01FE\u0200\u0202\u0204\u0206\u0208\u020A\u020C\u020E\u0210\u0212\u0214\u0216" \ r"\u01DE\u01E0\u01E2\u01E4\u01E6\u01E8\u01EA\u01EC\u01EE\u01F1\u01F4\u01F6-\u01F8\u01FA"
r"\u0218\u021A\u021C\u021E\u0220\u0222\u0224\u0226\u0228\u022A\u022C\u022E\u0230\u0232" \ r"\u01FC\u01FE\u0200\u0202\u0204\u0206\u0208\u020A\u020C\u020E\u0210\u0212\u0214\u0216"
r"\u0218\u021A\u021C\u021E\u0220\u0222\u0224\u0226\u0228\u022A\u022C\u022E\u0230\u0232"
r"\u023A\u023B\u023D\u023E\u0241\u0243-\u0246\u0248\u024A\u024C\u024E" r"\u023A\u023B\u023D\u023E\u0241\u0243-\u0246\u0248\u024A\u024C\u024E"
_latin_l_extendedB = r"\u0180\u0183\u0185\u0188\u018C\u018D\u0192\u0195\u0199-\u019B\u019E\u01A1\u01A3\u01A5" \ )
r"\u01A8\u01AA\u01AB\u01AD\u01B0\u01B4\u01B6\u01B9\u01BA\u01BD-\u01BF\u01C6\u01C9\u01CC" \ _latin_l_extendedB = (
r"\u01CE\u01D0\u01D2\u01D4\u01D6\u01D8\u01DA\u01DC\u01DD\u01DF\u01E1\u01E3\u01E5\u01E7" \ r"\u0180\u0183\u0185\u0188\u018C\u018D\u0192\u0195\u0199-\u019B\u019E\u01A1\u01A3\u01A5"
r"\u01E9\u01EB\u01ED\u01EF\u01F0\u01F3\u01F5\u01F9\u01FB\u01FD\u01FF\u0201\u0203\u0205" \ r"\u01A8\u01AA\u01AB\u01AD\u01B0\u01B4\u01B6\u01B9\u01BA\u01BD-\u01BF\u01C6\u01C9\u01CC"
r"\u0207\u0209\u020B\u020D\u020F\u0211\u0213\u0215\u0217\u0219\u021B\u021D\u021F\u0221" \ r"\u01CE\u01D0\u01D2\u01D4\u01D6\u01D8\u01DA\u01DC\u01DD\u01DF\u01E1\u01E3\u01E5\u01E7"
r"\u0223\u0225\u0227\u0229\u022B\u022D\u022F\u0231\u0233-\u0239\u023C\u023F\u0240\u0242" \ r"\u01E9\u01EB\u01ED\u01EF\u01F0\u01F3\u01F5\u01F9\u01FB\u01FD\u01FF\u0201\u0203\u0205"
r"\u0207\u0209\u020B\u020D\u020F\u0211\u0213\u0215\u0217\u0219\u021B\u021D\u021F\u0221"
r"\u0223\u0225\u0227\u0229\u022B\u022D\u022F\u0231\u0233-\u0239\u023C\u023F\u0240\u0242"
r"\u0247\u0249\u024B\u024D\u024F" r"\u0247\u0249\u024B\u024D\u024F"
)
_latin_extendedB = r"\u0180-\u01BF\u01C4-\u024F" _latin_extendedB = r"\u0180-\u01BF\u01C4-\u024F"
# special characters - Uighur, Uralic Phonetic # special characters - Uighur, Uralic Phonetic
_latin_u_extendedC = r"\u2C60\u2C62-\u2C64\u2C67\u2C69\u2C6B\u2C6D-\u2C70\u2C72\u2C75\u2C7E\u2C7F" _latin_u_extendedC = (
_latin_l_extendedC = r"\u2C61\u2C65\u2C66\u2C68\u2C6A\u2C6C\u2C71\u2C73\u2C74\u2C76-\u2C7B" r"\u2C60\u2C62-\u2C64\u2C67\u2C69\u2C6B\u2C6D-\u2C70\u2C72\u2C75\u2C7E\u2C7F"
)
_latin_l_extendedC = (
r"\u2C61\u2C65\u2C66\u2C68\u2C6A\u2C6C\u2C71\u2C73\u2C74\u2C76-\u2C7B"
)
_latin_extendedC = r"\u2C60-\u2C7B\u2C7E\u2C7F" _latin_extendedC = r"\u2C60-\u2C7B\u2C7E\u2C7F"
# special characters - phonetic, Mayan, Medieval # special characters - phonetic, Mayan, Medieval
_latin_u_extendedD = r"\uA722\uA724\uA726\uA728\uA72A\uA72C\uA72E\uA732\uA734\uA736\uA738\uA73A\uA73C" \ _latin_u_extendedD = (
r"\uA73E\uA740\uA742\uA744\uA746\uA748\uA74A\uA74C\uA74E\uA750\uA752\uA754\uA756\uA758" \ r"\uA722\uA724\uA726\uA728\uA72A\uA72C\uA72E\uA732\uA734\uA736\uA738\uA73A\uA73C"
r"\uA75A\uA75C\uA75E\uA760\uA762\uA764\uA766\uA768\uA76A\uA76C\uA76E\uA779\uA77B\uA77D" \ r"\uA73E\uA740\uA742\uA744\uA746\uA748\uA74A\uA74C\uA74E\uA750\uA752\uA754\uA756\uA758"
r"\uA77E\uA780\uA782\uA784\uA786\uA78B\uA78D\uA790\uA792\uA796\uA798\uA79A\uA79C\uA79E" \ r"\uA75A\uA75C\uA75E\uA760\uA762\uA764\uA766\uA768\uA76A\uA76C\uA76E\uA779\uA77B\uA77D"
r"\uA77E\uA780\uA782\uA784\uA786\uA78B\uA78D\uA790\uA792\uA796\uA798\uA79A\uA79C\uA79E"
r"\uA7A0\uA7A2\uA7A4\uA7A6\uA7A8\uA7AA-\uA7AE\uA7B0-\uA7B4\uA7B6\uA7B8" r"\uA7A0\uA7A2\uA7A4\uA7A6\uA7A8\uA7AA-\uA7AE\uA7B0-\uA7B4\uA7B6\uA7B8"
_latin_l_extendedD = r"\uA723\uA725\uA727\uA729\uA72B\uA72D\uA72F-\uA731\uA733\uA735\uA737\uA739\uA73B\uA73D" \ )
r"\uA73F\uA741\uA743\uA745\uA747\uA749\uA74B\uA74D\uA74F\uA751\uA753\uA755\uA757\uA759" \ _latin_l_extendedD = (
r"\uA75B\uA75D\uA75F\uA761\uA763\uA765\uA767\uA769\uA76B\uA76D\uA76F\uA771-\uA778\uA77A" \ r"\uA723\uA725\uA727\uA729\uA72B\uA72D\uA72F-\uA731\uA733\uA735\uA737\uA739\uA73B\uA73D"
r"\uA77C\uA77F\uA781\uA783\uA785\uA787\uA78C\uA78E\uA791\uA793-\uA795\uA797\uA799\uA79B" \ r"\uA73F\uA741\uA743\uA745\uA747\uA749\uA74B\uA74D\uA74F\uA751\uA753\uA755\uA757\uA759"
r"\uA75B\uA75D\uA75F\uA761\uA763\uA765\uA767\uA769\uA76B\uA76D\uA76F\uA771-\uA778\uA77A"
r"\uA77C\uA77F\uA781\uA783\uA785\uA787\uA78C\uA78E\uA791\uA793-\uA795\uA797\uA799\uA79B"
r"\uA79D\uA79F\uA7A1\uA7A3\uA7A5\uA7A7\uA7A9\uA7AF\uA7B5\uA7B7\uA7B9\uA7FA" r"\uA79D\uA79F\uA7A1\uA7A3\uA7A5\uA7A7\uA7A9\uA7AF\uA7B5\uA7B7\uA7B9\uA7FA"
)
_latin_extendedD = r"\uA722-\uA76F\uA771-\uA787\uA78B-\uA78E\uA790-\uA7B9\uA7FA" _latin_extendedD = r"\uA722-\uA76F\uA771-\uA787\uA78B-\uA78E\uA790-\uA7B9\uA7FA"
# special characters - phonetic Teuthonista and Sakha # special characters - phonetic Teuthonista and Sakha
@ -81,42 +97,80 @@ _latin_l_phonetic = r"\u0250-\u02AF\u1D00-\u1D25\u1D6B-\u1D77\u1D79-\u1D9A"
_latin_phonetic = _latin_l_phonetic _latin_phonetic = _latin_l_phonetic
# letters with multiple diacritics - Vietnamese # letters with multiple diacritics - Vietnamese
_latin_u_diacritics = r"\u1E00\u1E02\u1E04\u1E06\u1E08\u1E0A\u1E0C\u1E0E\u1E10\u1E12\u1E14\u1E16\u1E18\u1E1A" \ _latin_u_diacritics = (
r"\u1E1C\u1E1E\u1E20\u1E22\u1E24\u1E26\u1E28\u1E2A\u1E2C\u1E2E\u1E30\u1E32\u1E34\u1E36" \ r"\u1E00\u1E02\u1E04\u1E06\u1E08\u1E0A\u1E0C\u1E0E\u1E10\u1E12\u1E14\u1E16\u1E18\u1E1A"
r"\u1E38\u1E3A\u1E3C\u1E3E\u1E40\u1E42\u1E44\u1E46\u1E48\u1E4A\u1E4C\u1E4E\u1E50\u1E52" \ r"\u1E1C\u1E1E\u1E20\u1E22\u1E24\u1E26\u1E28\u1E2A\u1E2C\u1E2E\u1E30\u1E32\u1E34\u1E36"
r"\u1E54\u1E56\u1E58\u1E5A\u1E5C\u1E5E\u1E60\u1E62\u1E64\u1E66\u1E68\u1E6A\u1E6C\u1E6E" \ r"\u1E38\u1E3A\u1E3C\u1E3E\u1E40\u1E42\u1E44\u1E46\u1E48\u1E4A\u1E4C\u1E4E\u1E50\u1E52"
r"\u1E70\u1E72\u1E74\u1E76\u1E78\u1E7A\u1E7C\u1E7E\u1E80\u1E82\u1E84\u1E86\u1E88\u1E8A" \ r"\u1E54\u1E56\u1E58\u1E5A\u1E5C\u1E5E\u1E60\u1E62\u1E64\u1E66\u1E68\u1E6A\u1E6C\u1E6E"
r"\u1E8C\u1E8E\u1E90\u1E92\u1E94\u1E9E\u1EA0\u1EA2\u1EA4\u1EA6\u1EA8\u1EAA\u1EAC\u1EAE" \ r"\u1E70\u1E72\u1E74\u1E76\u1E78\u1E7A\u1E7C\u1E7E\u1E80\u1E82\u1E84\u1E86\u1E88\u1E8A"
r"\u1EB0\u1EB2\u1EB4\u1EB6\u1EB8\u1EBA\u1EBC\u1EBE\u1EC0\u1EC2\u1EC4\u1EC6\u1EC8" \ r"\u1E8C\u1E8E\u1E90\u1E92\u1E94\u1E9E\u1EA0\u1EA2\u1EA4\u1EA6\u1EA8\u1EAA\u1EAC\u1EAE"
r"\u1ECA\u1ECC\u1ECE\u1ED0\u1ED2\u1ED4\u1ED6\u1ED8\u1EDA\u1EDC\u1EDE\u1EE0\u1EE2\u1EE4" \ r"\u1EB0\u1EB2\u1EB4\u1EB6\u1EB8\u1EBA\u1EBC\u1EBE\u1EC0\u1EC2\u1EC4\u1EC6\u1EC8"
r"\u1ECA\u1ECC\u1ECE\u1ED0\u1ED2\u1ED4\u1ED6\u1ED8\u1EDA\u1EDC\u1EDE\u1EE0\u1EE2\u1EE4"
r"\u1EE6\u1EE8\u1EEA\u1EEC\u1EEE\u1EF0\u1EF2\u1EF4\u1EF6\u1EF8\u1EFA\u1EFC\u1EFE" r"\u1EE6\u1EE8\u1EEA\u1EEC\u1EEE\u1EF0\u1EF2\u1EF4\u1EF6\u1EF8\u1EFA\u1EFC\u1EFE"
_latin_l_diacritics = r"\u1E01\u1E03\u1E05\u1E07\u1E09\u1E0B\u1E0D\u1E0F\u1E11\u1E13\u1E15\u1E17\u1E19\u1E1B" \ )
r"\u1E1D\u1E1F\u1E21\u1E23\u1E25\u1E27\u1E29\u1E2B\u1E2D\u1E2F\u1E31\u1E33\u1E35\u1E37" \ _latin_l_diacritics = (
r"\u1E39\u1E3B\u1E3D\u1E3F\u1E41\u1E43\u1E45\u1E47\u1E49\u1E4B\u1E4D\u1E4F\u1E51\u1E53" \ r"\u1E01\u1E03\u1E05\u1E07\u1E09\u1E0B\u1E0D\u1E0F\u1E11\u1E13\u1E15\u1E17\u1E19\u1E1B"
r"\u1E55\u1E57\u1E59\u1E5B\u1E5D\u1E5F\u1E61\u1E63\u1E65\u1E67\u1E69\u1E6B\u1E6D\u1E6F" \ r"\u1E1D\u1E1F\u1E21\u1E23\u1E25\u1E27\u1E29\u1E2B\u1E2D\u1E2F\u1E31\u1E33\u1E35\u1E37"
r"\u1E71\u1E73\u1E75\u1E77\u1E79\u1E7B\u1E7D\u1E7F\u1E81\u1E83\u1E85\u1E87\u1E89\u1E8B" \ r"\u1E39\u1E3B\u1E3D\u1E3F\u1E41\u1E43\u1E45\u1E47\u1E49\u1E4B\u1E4D\u1E4F\u1E51\u1E53"
r"\u1E8D\u1E8F\u1E91\u1E93\u1E95-\u1E9D\u1E9F\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD" \ r"\u1E55\u1E57\u1E59\u1E5B\u1E5D\u1E5F\u1E61\u1E63\u1E65\u1E67\u1E69\u1E6B\u1E6D\u1E6F"
r"\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7\u1EB9\u1EBB\u1EBD\u1EBF\u1EC1\u1EC3\u1EC5\u1EC7\u1EC9" \ r"\u1E71\u1E73\u1E75\u1E77\u1E79\u1E7B\u1E7D\u1E7F\u1E81\u1E83\u1E85\u1E87\u1E89\u1E8B"
r"\u1ECB\u1ECD\u1ECF\u1ED1\u1ED3\u1ED5\u1ED7\u1ED9\u1EDB\u1EDD\u1EDF\u1EE1\u1EE3\u1EE5" \ r"\u1E8D\u1E8F\u1E91\u1E93\u1E95-\u1E9D\u1E9F\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD"
r"\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7\u1EB9\u1EBB\u1EBD\u1EBF\u1EC1\u1EC3\u1EC5\u1EC7\u1EC9"
r"\u1ECB\u1ECD\u1ECF\u1ED1\u1ED3\u1ED5\u1ED7\u1ED9\u1EDB\u1EDD\u1EDF\u1EE1\u1EE3\u1EE5"
r"\u1EE7\u1EE9\u1EEB\u1EED\u1EEF\u1EF1\u1EF3\u1EF5\u1EF7\u1EF9\u1EFB\u1EFD\u1EFF" r"\u1EE7\u1EE9\u1EEB\u1EED\u1EEF\u1EF1\u1EF3\u1EF5\u1EF7\u1EF9\u1EFB\u1EFD\u1EFF"
)
_latin_diacritics = r"\u1E00-\u1EFF" _latin_diacritics = r"\u1E00-\u1EFF"
# all lower latin classes # all lower latin classes
LATIN_LOWER_BASIC = _latin_l_standard + _latin_l_standard_fullwidth + _latin_l_supplement + _latin_l_extendedA LATIN_LOWER_BASIC = (
LATIN_LOWER = LATIN_LOWER_BASIC + _latin_l_extendedB + _latin_l_extendedC + _latin_l_extendedD + _latin_l_extendedE \ _latin_l_standard
+ _latin_l_phonetic + _latin_l_diacritics + _latin_l_standard_fullwidth
+ _latin_l_supplement
+ _latin_l_extendedA
)
LATIN_LOWER = (
LATIN_LOWER_BASIC
+ _latin_l_extendedB
+ _latin_l_extendedC
+ _latin_l_extendedD
+ _latin_l_extendedE
+ _latin_l_phonetic
+ _latin_l_diacritics
)
# all upper latin classes # all upper latin classes
LATIN_UPPER_BASIC = _latin_u_standard + _latin_u_standard_fullwidth + _latin_u_supplement + _latin_u_extendedA LATIN_UPPER_BASIC = (
LATIN_UPPER = LATIN_UPPER_BASIC + _latin_u_extendedB + _latin_u_extendedC + _latin_u_extendedD + _latin_u_diacritics _latin_u_standard
+ _latin_u_standard_fullwidth
+ _latin_u_supplement
+ _latin_u_extendedA
)
LATIN_UPPER = (
LATIN_UPPER_BASIC
+ _latin_u_extendedB
+ _latin_u_extendedC
+ _latin_u_extendedD
+ _latin_u_diacritics
)
# all latin classes # all latin classes
LATIN_BASIC = _latin_standard + _latin_standard_fullwidth + _latin_supplement + _latin_extendedA LATIN_BASIC = (
LATIN = LATIN_BASIC + _latin_extendedB + _latin_extendedC + _latin_extendedD + _latin_extendedE \ _latin_standard + _latin_standard_fullwidth + _latin_supplement + _latin_extendedA
+ _latin_phonetic + _latin_diacritics )
LATIN = (
LATIN_BASIC
+ _latin_extendedB
+ _latin_extendedC
+ _latin_extendedD
+ _latin_extendedE
+ _latin_phonetic
+ _latin_diacritics
)
_persian = r"\u0620-\u064A\u066E-\u06D5\u06E5-\u06FF\u0750-\u077F\u08A0-\u08BD" \ _persian = (
r"\u0620-\u064A\u066E-\u06D5\u06E5-\u06FF\u0750-\u077F\u08A0-\u08BD"
r"\uFB50-\uFBB1\uFBD3-\uFD3D\uFD50-\uFDC7\uFDF0-\uFDFB\uFE70-\uFEFC\U0001EE00-\U0001EEBB" r"\uFB50-\uFBB1\uFBD3-\uFD3D\uFD50-\uFDC7\uFDF0-\uFDFB\uFE70-\uFEFC\U0001EE00-\U0001EEBB"
)
_russian_lower = r"ёа-я" _russian_lower = r"ёа-я"
_russian_upper = r"ЁА-Я" _russian_upper = r"ЁА-Я"
@ -165,33 +219,35 @@ _hyphens = "- — -- --- —— ~"
# Various symbols like dingbats, but also emoji # Various symbols like dingbats, but also emoji
# Details: https://www.compart.com/en/unicode/category/So # Details: https://www.compart.com/en/unicode/category/So
_other_symbols = r"\u00A6\u00A9\u00AE\u00B0\u0482\u058D\u058E\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u09FA\u0B70" \ _other_symbols = (
r"\u0BF3-\u0BF8\u0BFA\u0C7F\u0D4F\u0D79\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34" \ r"\u00A6\u00A9\u00AE\u00B0\u0482\u058D\u058E\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u09FA\u0B70"
r"\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399" \ r"\u0BF3-\u0BF8\u0BFA\u0C7F\u0D4F\u0D79\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34"
r"\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116" \ r"\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399"
r"\u2117\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u214A\u214C\u214D\u214F\u218A\u218B" \ r"\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116"
r"\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D3" \ r"\u2117\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u214A\u214C\u214D\u214F\u218A\u218B"
r"\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u23B4-\u23DB" \ r"\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D3"
r"\u23E2-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u266E" \ r"\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u23B4-\u23DB"
r"\u2670-\u2767\u2794-\u27BF\u2800-\u28FF\u2B00-\u2B2F\u2B45\u2B46\u2B4D-\u2B73\u2B76-\u2B95" \ r"\u23E2-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u266E"
r"\u2B98-\u2BC8\u2BCA-\u2BFE\u2CE5-\u2CEA\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB" \ r"\u2670-\u2767\u2794-\u27BF\u2800-\u28FF\u2B00-\u2B2F\u2B45\u2B46\u2B4D-\u2B73\u2B76-\u2B95"
r"\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u31C0-\u31E3" \ r"\u2B98-\u2BC8\u2BCA-\u2BFE\u2CE5-\u2CEA\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB"
r"\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u32FE\u3300-\u33FF\u4DC0-\u4DFF" \ r"\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u31C0-\u31E3"
r"\uA490-\uA4C6\uA828-\uA82B\uA836\uA837\uA839\uAA77-\uAA79\uFDFD\uFFE4\uFFE8\uFFED\uFFEE\uFFFC" \ r"\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u32FE\u3300-\u33FF\u4DC0-\u4DFF"
r"\uFFFD\U00010137-\U0001013F\U00010179-\U00010189\U0001018C-\U0001018E\U00010190-\U0001019B" \ r"\uA490-\uA4C6\uA828-\uA82B\uA836\uA837\uA839\uAA77-\uAA79\uFDFD\uFFE4\uFFE8\uFFED\uFFEE\uFFFC"
r"\U000101A0\U000101D0-\U000101FC\U00010877\U00010878\U00010AC8\U0001173F\U00016B3C-\U00016B3F" \ r"\uFFFD\U00010137-\U0001013F\U00010179-\U00010189\U0001018C-\U0001018E\U00010190-\U0001019B"
r"\U00016B45\U0001BC9C\U0001D000-\U0001D0F5\U0001D100-\U0001D126\U0001D129-\U0001D164" \ r"\U000101A0\U000101D0-\U000101FC\U00010877\U00010878\U00010AC8\U0001173F\U00016B3C-\U00016B3F"
r"\U0001D16A-\U0001D16C\U0001D183\U0001D184\U0001D18C-\U0001D1A9\U0001D1AE-\U0001D1E8" \ r"\U00016B45\U0001BC9C\U0001D000-\U0001D0F5\U0001D100-\U0001D126\U0001D129-\U0001D164"
r"\U0001D200-\U0001D241\U0001D245\U0001D300-\U0001D356\U0001D800-\U0001D9FF\U0001DA37-\U0001DA3A" \ r"\U0001D16A-\U0001D16C\U0001D183\U0001D184\U0001D18C-\U0001D1A9\U0001D1AE-\U0001D1E8"
r"\U0001DA6D-\U0001DA74\U0001DA76-\U0001DA83\U0001DA85\U0001DA86\U0001ECAC\U0001F000-\U0001F02B" \ r"\U0001D200-\U0001D241\U0001D245\U0001D300-\U0001D356\U0001D800-\U0001D9FF\U0001DA37-\U0001DA3A"
r"\U0001F030-\U0001F093\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF" \ r"\U0001DA6D-\U0001DA74\U0001DA76-\U0001DA83\U0001DA85\U0001DA86\U0001ECAC\U0001F000-\U0001F02B"
r"\U0001F0D1-\U0001F0F5\U0001F110-\U0001F16B\U0001F170-\U0001F1AC\U0001F1E6-\U0001F202" \ r"\U0001F030-\U0001F093\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF"
r"\U0001F210-\U0001F23B\U0001F240-\U0001F248\U0001F250\U0001F251\U0001F260-\U0001F265" \ r"\U0001F0D1-\U0001F0F5\U0001F110-\U0001F16B\U0001F170-\U0001F1AC\U0001F1E6-\U0001F202"
r"\U0001F300-\U0001F3FA\U0001F400-\U0001F6D4\U0001F6E0-\U0001F6EC\U0001F6F0-\U0001F6F9" \ r"\U0001F210-\U0001F23B\U0001F240-\U0001F248\U0001F250\U0001F251\U0001F260-\U0001F265"
r"\U0001F700-\U0001F773\U0001F780-\U0001F7D8\U0001F800-\U0001F80B\U0001F810-\U0001F847" \ r"\U0001F300-\U0001F3FA\U0001F400-\U0001F6D4\U0001F6E0-\U0001F6EC\U0001F6F0-\U0001F6F9"
r"\U0001F850-\U0001F859\U0001F860-\U0001F887\U0001F890-\U0001F8AD\U0001F900-\U0001F90B" \ r"\U0001F700-\U0001F773\U0001F780-\U0001F7D8\U0001F800-\U0001F80B\U0001F810-\U0001F847"
r"\U0001F910-\U0001F93E\U0001F940-\U0001F970\U0001F973-\U0001F976\U0001F97A\U0001F97C-\U0001F9A2" \ r"\U0001F850-\U0001F859\U0001F860-\U0001F887\U0001F890-\U0001F8AD\U0001F900-\U0001F90B"
r"\U0001F910-\U0001F93E\U0001F940-\U0001F970\U0001F973-\U0001F976\U0001F97A\U0001F97C-\U0001F9A2"
r"\U0001F9B0-\U0001F9B9\U0001F9C0-\U0001F9C2\U0001F9D0-\U0001F9FF\U0001FA60-\U0001FA6D" r"\U0001F9B0-\U0001F9B9\U0001F9C0-\U0001F9C2\U0001F9D0-\U0001F9FF\U0001FA60-\U0001FA6D"
)
UNITS = merge_chars(_units) UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency) CURRENCY = merge_chars(_currency)

View File

@ -19,6 +19,7 @@ cdef struct WeightsC:
const float* feat_bias const float* feat_bias
const float* hidden_bias const float* hidden_bias
const float* hidden_weights const float* hidden_weights
const float* seen_classes
cdef struct ActivationsC: cdef struct ActivationsC:

View File

@ -44,8 +44,10 @@ cdef WeightsC get_c_weights(model) except *:
output.feat_bias = <const float*>state2vec.bias.data output.feat_bias = <const float*>state2vec.bias.data
cdef np.ndarray vec2scores_W = model.vec2scores.W cdef np.ndarray vec2scores_W = model.vec2scores.W
cdef np.ndarray vec2scores_b = model.vec2scores.b cdef np.ndarray vec2scores_b = model.vec2scores.b
cdef np.ndarray class_mask = model._class_mask
output.hidden_weights = <const float*>vec2scores_W.data output.hidden_weights = <const float*>vec2scores_W.data
output.hidden_bias = <const float*>vec2scores_b.data output.hidden_bias = <const float*>vec2scores_b.data
output.seen_classes = <const float*>class_mask.data
return output return output
@ -115,6 +117,16 @@ cdef void predict_states(ActivationsC* A, StateC** states,
for i in range(n.states): for i in range(n.states):
VecVec.add_i(&A.scores[i*n.classes], VecVec.add_i(&A.scores[i*n.classes],
W.hidden_bias, 1., n.classes) W.hidden_bias, 1., n.classes)
# Set unseen classes to minimum value
i = 0
min_ = A.scores[0]
for i in range(1, n.states * n.classes):
if A.scores[i] < min_:
min_ = A.scores[i]
for i in range(n.states):
for j in range(n.classes):
if not W.seen_classes[j]:
A.scores[i*n.classes+j] = min_
cdef void sum_state_features(float* output, cdef void sum_state_features(float* output,
@ -189,12 +201,17 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
class ParserModel(Model): class ParserModel(Model):
def __init__(self, tok2vec, lower_model, upper_model): def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
Model.__init__(self) Model.__init__(self)
self._layers = [tok2vec, lower_model, upper_model] self._layers = [tok2vec, lower_model, upper_model]
self.unseen_classes = set()
if unseen_classes:
for class_ in unseen_classes:
self.unseen_classes.add(class_)
def begin_update(self, docs, drop=0.): def begin_update(self, docs, drop=0.):
step_model = ParserStepModel(docs, self._layers, drop=drop) step_model = ParserStepModel(docs, self._layers, drop=drop,
unseen_classes=self.unseen_classes)
def finish_parser_update(golds, sgd=None): def finish_parser_update(golds, sgd=None):
step_model.make_updates(sgd) step_model.make_updates(sgd)
return None return None
@ -207,9 +224,8 @@ class ParserModel(Model):
with Model.use_device('cpu'): with Model.use_device('cpu'):
larger = Affine(new_output, smaller.nI) larger = Affine(new_output, smaller.nI)
# Set nan as value for unseen classes, to prevent prediction. larger.W.fill(0.0)
larger.W.fill(self.ops.xp.nan) larger.b.fill(0.0)
larger.b.fill(self.ops.xp.nan)
# It seems very unhappy if I pass these as smaller.W? # It seems very unhappy if I pass these as smaller.W?
# Seems to segfault. Maybe it's a descriptor protocol thing? # Seems to segfault. Maybe it's a descriptor protocol thing?
smaller_W = smaller.W smaller_W = smaller.W
@ -221,6 +237,8 @@ class ParserModel(Model):
larger_W[:smaller.nO] = smaller_W larger_W[:smaller.nO] = smaller_W
larger_b[:smaller.nO] = smaller_b larger_b[:smaller.nO] = smaller_b
self._layers[-1] = larger self._layers[-1] = larger
for i in range(smaller.nO, new_output):
self.unseen_classes.add(i)
def begin_training(self, X, y=None): def begin_training(self, X, y=None):
self.lower.begin_training(X, y=y) self.lower.begin_training(X, y=y)
@ -239,18 +257,32 @@ class ParserModel(Model):
class ParserStepModel(Model): class ParserStepModel(Model):
def __init__(self, docs, layers, drop=0.): def __init__(self, docs, layers, unseen_classes=None, drop=0.):
self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop) self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop)
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1], self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
drop=drop) drop=drop)
self.vec2scores = layers[-1] self.vec2scores = layers[-1]
self.cuda_stream = util.get_cuda_stream() self.cuda_stream = util.get_cuda_stream()
self.backprops = [] self.backprops = []
self._class_mask = numpy.zeros((self.vec2scores.nO,), dtype='f')
self._class_mask.fill(1)
if unseen_classes is not None:
for class_ in unseen_classes:
self._class_mask[class_] = 0.
@property @property
def nO(self): def nO(self):
return self.state2vec.nO return self.state2vec.nO
def class_is_unseen(self, class_):
return self._class_mask[class_]
def mark_class_unseen(self, class_):
self._class_mask[class_] = 0
def mark_class_seen(self, class_):
self._class_mask[class_] = 1
def begin_update(self, states, drop=0.): def begin_update(self, states, drop=0.):
token_ids = self.get_token_ids(states) token_ids = self.get_token_ids(states)
vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0) vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
@ -258,24 +290,12 @@ class ParserStepModel(Model):
if mask is not None: if mask is not None:
vector *= mask vector *= mask
scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop) scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
# We can have nans from unseen classes. # If the class is unseen, make sure its score is minimum
# For backprop purposes, we want to treat unseen classes as having the scores[:, self._class_mask == 0] = numpy.nanmin(scores)
# lowest score.
# numpy's nan_to_num function doesn't take a value, and nan is replaced
# by 0...-inf is replaced by minimum, so we go via that. Ugly to the max.
# Note that scores is always a numpy array! Should fix #3112
scores[numpy.isnan(scores)] = -numpy.inf
numpy.nan_to_num(scores, copy=False)
def backprop_parser_step(d_scores, sgd=None): def backprop_parser_step(d_scores, sgd=None):
# If we have a non-zero gradient for a previously unseen class, # Zero vectors for unseen classes
# replace the weight with 0. d_scores *= self._class_mask
new_classes = self.vec2scores.ops.xp.logical_and(
self.vec2scores.ops.xp.isnan(self.vec2scores.b),
d_scores.any(axis=0)
)
self.vec2scores.b[new_classes] = 0.
self.vec2scores.W[new_classes] = 0.
d_vector = get_d_vector(d_scores, sgd=sgd) d_vector = get_d_vector(d_scores, sgd=sgd)
if mask is not None: if mask is not None:
d_vector *= mask d_vector *= mask

View File

@ -163,6 +163,8 @@ cdef class Parser:
added = self.moves.add_action(action, label) added = self.moves.add_action(action, label)
if added: if added:
resized = True resized = True
if resized and "nr_class" in self.cfg:
self.cfg["nr_class"] = self.moves.n_moves
if self.model not in (True, False, None) and resized: if self.model not in (True, False, None) and resized:
self.model.resize_output(self.moves.n_moves) self.model.resize_output(self.moves.n_moves)
@ -435,22 +437,22 @@ cdef class Parser:
if self._rehearsal_model is None: if self._rehearsal_model is None:
return None return None
losses.setdefault(self.name, 0.) losses.setdefault(self.name, 0.)
states = self.moves.init_batch(docs) states = self.moves.init_batch(docs)
# This is pretty dirty, but the NER can resize itself in init_batch, # This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to # if labels are missing. We therefore have to check whether we need to
# expand our model output. # expand our model output.
self.model.resize_output(self.moves.n_moves) self.model.resize_output(self.moves.n_moves)
self._rehearsal_model.resize_output(self.moves.n_moves)
# Prepare the stepwise model, and get the callback for finishing the batch # Prepare the stepwise model, and get the callback for finishing the batch
tutor = self._rehearsal_model(docs) tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0)
model, finish_update = self.model.begin_update(docs, drop=0.0) model, finish_update = self.model.begin_update(docs, drop=0.0)
n_scores = 0. n_scores = 0.
loss = 0. loss = 0.
non_zeroed_classes = self._rehearsal_model.upper.W.any(axis=1)
while states: while states:
targets, _ = tutor.begin_update(states) targets, _ = tutor.begin_update(states, drop=0.)
guesses, backprop = model.begin_update(states) guesses, backprop = model.begin_update(states, drop=0.)
d_scores = (targets - guesses) / targets.shape[0] d_scores = (guesses - targets) / targets.shape[0]
d_scores *= non_zeroed_classes
# If all weights for an output are 0 in the original model, don't # If all weights for an output are 0 in the original model, don't
# supervise that output. This allows us to add classes. # supervise that output. This allows us to add classes.
loss += (d_scores**2).sum() loss += (d_scores**2).sum()
@ -543,6 +545,9 @@ cdef class Parser:
memset(is_valid, 0, self.moves.n_moves * sizeof(int)) memset(is_valid, 0, self.moves.n_moves * sizeof(int))
memset(costs, 0, self.moves.n_moves * sizeof(float)) memset(costs, 0, self.moves.n_moves * sizeof(float))
self.moves.set_costs(is_valid, costs, state, gold) self.moves.set_costs(is_valid, costs, state, gold)
for j in range(self.moves.n_moves):
if costs[j] <= 0.0 and j in self.model.unseen_classes:
self.model.unseen_classes.remove(j)
cpu_log_loss(c_d_scores, cpu_log_loss(c_d_scores,
costs, is_valid, &scores[i, 0], d_scores.shape[1]) costs, is_valid, &scores[i, 0], d_scores.shape[1])
c_d_scores += d_scores.shape[1] c_d_scores += d_scores.shape[1]

View File

@ -147,6 +147,8 @@ cdef class TransitionSystem:
def initialize_actions(self, labels_by_action, min_freq=None): def initialize_actions(self, labels_by_action, min_freq=None):
self.labels = {} self.labels = {}
self.n_moves = 0 self.n_moves = 0
added_labels = []
added_actions = {}
for action, label_freqs in sorted(labels_by_action.items()): for action, label_freqs in sorted(labels_by_action.items()):
action = int(action) action = int(action)
# Make sure we take a copy here, and that we get a Counter # Make sure we take a copy here, and that we get a Counter
@ -157,6 +159,15 @@ cdef class TransitionSystem:
sorted_labels.sort() sorted_labels.sort()
sorted_labels.reverse() sorted_labels.reverse()
for freq, label_str in sorted_labels: for freq, label_str in sorted_labels:
if freq < 0:
added_labels.append((freq, label_str))
added_actions.setdefault(label_str, []).append(action)
else:
self.add_action(int(action), label_str)
self.labels[action][label_str] = freq
added_labels.sort(reverse=True)
for freq, label_str in added_labels:
for action in added_actions[label_str]:
self.add_action(int(action), label_str) self.add_action(int(action), label_str)
self.labels[action][label_str] = freq self.labels[action][label_str] = freq

View File

@ -6,7 +6,6 @@ import pytest
import numpy import numpy
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.attrs import LEMMA
from spacy.errors import ModelsWarning from spacy.errors import ModelsWarning
from ..util import get_doc from ..util import get_doc
@ -139,81 +138,6 @@ def test_doc_api_set_ents(en_tokenizer):
assert tokens.ents[0].end == 4 assert tokens.ents[0].end == 4
def test_doc_api_merge(en_tokenizer):
text = "WKRO played songs by the beach boys all night"
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
# merge both with bulk merge
doc = en_tokenizer(text)
assert len(doc) == 9
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7], attrs=attrs)
retokenizer.merge(doc[7:9], attrs=attrs)
assert len(doc) == 6
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED"
assert doc[5].text == "all night"
assert doc[5].text_with_ws == "all night"
assert doc[5].tag_ == "NAMED"
def test_doc_api_merge_children(en_tokenizer):
"""Test that attachments work correctly after merging."""
text = "WKRO played songs by the beach boys all night"
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
doc = en_tokenizer(text)
assert len(doc) == 9
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7], attrs=attrs)
for word in doc:
if word.i < word.head.i:
assert word in list(word.head.lefts)
elif word.i > word.head.i:
assert word in list(word.head.rights)
def test_doc_api_merge_hang(en_tokenizer):
text = "through North and South Carolina"
doc = en_tokenizer(text)
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"})
retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"})
def test_doc_api_retokenizer(en_tokenizer):
doc = en_tokenizer("WKRO played songs by the beach boys all night")
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7])
assert len(doc) == 7
assert doc[4].text == "the beach boys"
def test_doc_api_retokenizer_attrs(en_tokenizer):
doc = en_tokenizer("WKRO played songs by the beach boys all night")
# test both string and integer attributes and values
attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]}
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7], attrs=attrs)
assert len(doc) == 7
assert doc[4].text == "the beach boys"
assert doc[4].lemma_ == "boys"
assert doc[4].ent_type_ == "ORG"
@pytest.mark.xfail
def test_doc_api_retokenizer_lex_attrs(en_tokenizer):
"""Test that lexical attributes can be changed (see #2390)."""
doc = en_tokenizer("WKRO played beach boys songs")
assert not any(token.is_stop for token in doc)
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True})
assert doc[2].text == "beach boys"
assert doc[2].lemma_ == "boys"
assert doc[2].is_stop
new_doc = Doc(doc.vocab, words=["beach boys"])
assert new_doc[0].is_stop
def test_doc_api_sents_empty_string(en_tokenizer): def test_doc_api_sents_empty_string(en_tokenizer):
doc = en_tokenizer("") doc = en_tokenizer("")
doc.is_parsed = True doc.is_parsed = True

View File

@ -1,14 +1,89 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from spacy.attrs import LEMMA
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.tokens import Doc from spacy.tokens import Doc
import pytest
from ..util import get_doc from ..util import get_doc
def test_spans_merge_tokens(en_tokenizer): def test_doc_retokenize_merge(en_tokenizer):
text = "WKRO played songs by the beach boys all night"
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
doc = en_tokenizer(text)
assert len(doc) == 9
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7], attrs=attrs)
retokenizer.merge(doc[7:9], attrs=attrs)
assert len(doc) == 6
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED"
assert doc[5].text == "all night"
assert doc[5].text_with_ws == "all night"
assert doc[5].tag_ == "NAMED"
def test_doc_retokenize_merge_children(en_tokenizer):
"""Test that attachments work correctly after merging."""
text = "WKRO played songs by the beach boys all night"
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
doc = en_tokenizer(text)
assert len(doc) == 9
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7], attrs=attrs)
for word in doc:
if word.i < word.head.i:
assert word in list(word.head.lefts)
elif word.i > word.head.i:
assert word in list(word.head.rights)
def test_doc_retokenize_merge_hang(en_tokenizer):
text = "through North and South Carolina"
doc = en_tokenizer(text)
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"})
retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"})
def test_doc_retokenize_retokenizer(en_tokenizer):
doc = en_tokenizer("WKRO played songs by the beach boys all night")
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7])
assert len(doc) == 7
assert doc[4].text == "the beach boys"
def test_doc_retokenize_retokenizer_attrs(en_tokenizer):
doc = en_tokenizer("WKRO played songs by the beach boys all night")
# test both string and integer attributes and values
attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]}
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7], attrs=attrs)
assert len(doc) == 7
assert doc[4].text == "the beach boys"
assert doc[4].lemma_ == "boys"
assert doc[4].ent_type_ == "ORG"
@pytest.mark.xfail
def test_doc_retokenize_lex_attrs(en_tokenizer):
"""Test that lexical attributes can be changed (see #2390)."""
doc = en_tokenizer("WKRO played beach boys songs")
assert not any(token.is_stop for token in doc)
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True})
assert doc[2].text == "beach boys"
assert doc[2].lemma_ == "boys"
assert doc[2].is_stop
new_doc = Doc(doc.vocab, words=["beach boys"])
assert new_doc[0].is_stop
def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
text = "Los Angeles start." text = "Los Angeles start."
heads = [1, 1, 0, -1] heads = [1, 1, 0, -1]
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
@ -25,7 +100,7 @@ def test_spans_merge_tokens(en_tokenizer):
assert doc[0].ent_type_ == "GPE" assert doc[0].ent_type_ == "GPE"
def test_spans_merge_heads(en_tokenizer): def test_doc_retokenize_spans_merge_heads(en_tokenizer):
text = "I found a pilates class near work." text = "I found a pilates class near work."
heads = [1, 0, 2, 1, -3, -1, -1, -6] heads = [1, 0, 2, 1, -3, -1, -1, -6]
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
@ -43,7 +118,7 @@ def test_spans_merge_heads(en_tokenizer):
assert doc[5].head.i == 4 assert doc[5].head.i == 4
def test_spans_merge_non_disjoint(en_tokenizer): def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer):
text = "Los Angeles start." text = "Los Angeles start."
doc = en_tokenizer(text) doc = en_tokenizer(text)
with pytest.raises(ValueError): with pytest.raises(ValueError):
@ -58,7 +133,7 @@ def test_spans_merge_non_disjoint(en_tokenizer):
) )
def test_span_np_merges(en_tokenizer): def test_doc_retokenize_span_np_merges(en_tokenizer):
text = "displaCy is a parse tool built with Javascript" text = "displaCy is a parse tool built with Javascript"
heads = [1, 0, 2, 1, -3, -1, -1, -1] heads = [1, 0, 2, 1, -3, -1, -1, -1]
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
@ -87,7 +162,7 @@ def test_span_np_merges(en_tokenizer):
retokenizer.merge(ent) retokenizer.merge(ent)
def test_spans_entity_merge(en_tokenizer): def test_doc_retokenize_spans_entity_merge(en_tokenizer):
# fmt: off # fmt: off
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n" text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1] heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1]
@ -108,7 +183,7 @@ def test_spans_entity_merge(en_tokenizer):
assert len(doc) == 15 assert len(doc) == 15
def test_spans_entity_merge_iob(): def test_doc_retokenize_spans_entity_merge_iob():
# Test entity IOB stays consistent after merging # Test entity IOB stays consistent after merging
words = ["a", "b", "c", "d", "e"] words = ["a", "b", "c", "d", "e"]
doc = Doc(Vocab(), words=words) doc = Doc(Vocab(), words=words)
@ -147,7 +222,7 @@ def test_spans_entity_merge_iob():
assert doc[4].ent_iob_ == "I" assert doc[4].ent_iob_ == "I"
def test_spans_sentence_update_after_merge(en_tokenizer): def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
# fmt: off # fmt: off
text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale." text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7] heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]
@ -155,7 +230,6 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj', 'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
'compound', 'dobj', 'punct'] 'compound', 'dobj', 'punct']
# fmt: on # fmt: on
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
sent1, sent2 = list(doc.sents) sent1, sent2 = list(doc.sents)
@ -169,7 +243,7 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
assert len(sent2) == init_len2 - 1 assert len(sent2) == init_len2 - 1
def test_spans_subtree_size_check(en_tokenizer): def test_doc_retokenize_spans_subtree_size_check(en_tokenizer):
# fmt: off # fmt: off
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale" text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2] heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2]
@ -177,7 +251,6 @@ def test_spans_subtree_size_check(en_tokenizer):
"nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound", "nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound",
"dobj"] "dobj"]
# fmt: on # fmt: on
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
sent1 = list(doc.sents)[0] sent1 = list(doc.sents)[0]

View File

@ -8,7 +8,7 @@ from spacy.tokens import Doc
from ..util import get_doc from ..util import get_doc
def test_doc_split(en_vocab): def test_doc_retokenize_split(en_vocab):
words = ["LosAngeles", "start", "."] words = ["LosAngeles", "start", "."]
heads = [1, 1, 0] heads = [1, 1, 0]
doc = get_doc(en_vocab, words=words, heads=heads) doc = get_doc(en_vocab, words=words, heads=heads)
@ -41,7 +41,7 @@ def test_doc_split(en_vocab):
assert len(str(doc)) == 19 assert len(str(doc)) == 19
def test_split_dependencies(en_vocab): def test_doc_retokenize_split_dependencies(en_vocab):
doc = Doc(en_vocab, words=["LosAngeles", "start", "."]) doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
dep1 = doc.vocab.strings.add("amod") dep1 = doc.vocab.strings.add("amod")
dep2 = doc.vocab.strings.add("subject") dep2 = doc.vocab.strings.add("subject")
@ -56,7 +56,7 @@ def test_split_dependencies(en_vocab):
assert doc[1].dep == dep2 assert doc[1].dep == dep2
def test_split_heads_error(en_vocab): def test_doc_retokenize_split_heads_error(en_vocab):
doc = Doc(en_vocab, words=["LosAngeles", "start", "."]) doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
# Not enough heads # Not enough heads
with pytest.raises(ValueError): with pytest.raises(ValueError):
@ -69,7 +69,7 @@ def test_split_heads_error(en_vocab):
retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]]) retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]])
def test_spans_entity_merge_iob(): def test_doc_retokenize_spans_entity_split_iob():
# Test entity IOB stays consistent after merging # Test entity IOB stays consistent after merging
words = ["abc", "d", "e"] words = ["abc", "d", "e"]
doc = Doc(Vocab(), words=words) doc = Doc(Vocab(), words=words)
@ -84,7 +84,7 @@ def test_spans_entity_merge_iob():
assert doc[3].ent_iob_ == "I" assert doc[3].ent_iob_ == "I"
def test_spans_sentence_update_after_merge(en_vocab): def test_doc_retokenize_spans_sentence_update_after_split(en_vocab):
# fmt: off # fmt: off
words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He", words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He",
"lives", "in", "England", "and", "loves", "JoePasquale", "."] "lives", "in", "England", "and", "loves", "JoePasquale", "."]
@ -114,7 +114,7 @@ def test_spans_sentence_update_after_merge(en_vocab):
assert len(sent2) == init_len2 + 1 assert len(sent2) == init_len2 + 1
def test_split_orths_mismatch(en_vocab): def test_doc_retokenize_split_orths_mismatch(en_vocab):
"""Test that the regular retokenizer.split raises an error if the orths """Test that the regular retokenizer.split raises an error if the orths
don't match the original token text. There might still be a method that don't match the original token text. There might still be a method that
allows this, but for the default use cases, merging and splitting should allows this, but for the default use cases, merging and splitting should

View File

@ -1,7 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.tokens import Token, Doc from spacy.tokens import Token, Doc
@ -59,6 +58,5 @@ def test_issue_1971_4(en_vocab):
pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
matcher.add("TEST", None, pattern) matcher.add("TEST", None, pattern)
matches = matcher(doc) matches = matcher(doc)
# Interesting: uncommenting this causes a segmentation fault, so there's # Uncommenting this caused a segmentation fault
# definitely something going on here assert len(matches) == 1
# assert len(matches) == 1

View File

@ -1,7 +1,6 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
import numpy import numpy
from spacy import displacy from spacy import displacy

View File

@ -1,7 +1,6 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from spacy.lang.en import English from spacy.lang.en import English

View File

@ -315,6 +315,11 @@ def read_regex(path):
def compile_prefix_regex(entries): def compile_prefix_regex(entries):
"""Compile a list of prefix rules into a regex object.
entries (tuple): The prefix rules, e.g. spacy.lang.punctuation.TOKENIZER_PREFIXES.
RETURNS (regex object): The regex object. to be used for Tokenizer.prefix_search.
"""
if "(" in entries: if "(" in entries:
# Handle deprecated data # Handle deprecated data
expression = "|".join( expression = "|".join(
@ -327,11 +332,21 @@ def compile_prefix_regex(entries):
def compile_suffix_regex(entries): def compile_suffix_regex(entries):
"""Compile a list of suffix rules into a regex object.
entries (tuple): The suffix rules, e.g. spacy.lang.punctuation.TOKENIZER_SUFFIXES.
RETURNS (regex object): The regex object. to be used for Tokenizer.suffix_search.
"""
expression = "|".join([piece + "$" for piece in entries if piece.strip()]) expression = "|".join([piece + "$" for piece in entries if piece.strip()])
return re.compile(expression) return re.compile(expression)
def compile_infix_regex(entries): def compile_infix_regex(entries):
"""Compile a list of infix rules into a regex object.
entries (tuple): The infix rules, e.g. spacy.lang.punctuation.TOKENIZER_INFIXES.
RETURNS (regex object): The regex object. to be used for Tokenizer.infix_finditer.
"""
expression = "|".join([piece for piece in entries if piece.strip()]) expression = "|".join([piece for piece in entries if piece.strip()])
return re.compile(expression) return re.compile(expression)

View File

@ -504,6 +504,57 @@ an error if key doesn't match `ORTH` values.
| `*addition_dicts` | dicts | Exception dictionaries to add to the base exceptions, in order. | | `*addition_dicts` | dicts | Exception dictionaries to add to the base exceptions, in order. |
| **RETURNS** | dict | Combined tokenizer exceptions. | | **RETURNS** | dict | Combined tokenizer exceptions. |
### util.compile_prefix_regex {#util.compile_prefix_regex tag="function"}
Compile a sequence of prefix rules into a regex object.
> #### Example
>
> ```python
> prefixes = ("§", "%", "=", r"\+")
> prefix_regex = util.compile_prefix_regex(prefixes)
> nlp.tokenizer.prefix_search = prefix_regex.search
> ```
| Name | Type | Description |
| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | tuple | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). |
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). |
### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
Compile a sequence of suffix rules into a regex object.
> #### Example
>
> ```python
> suffixes = ("'s", "'S", r"(?<=[0-9])\+")
> suffix_regex = util.compile_suffix_regex(suffixes)
> nlp.tokenizer.suffix_search = suffix_regex.search
> ```
| Name | Type | Description |
| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | tuple | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). |
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). |
### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
Compile a sequence of infix rules into a regex object.
> #### Example
>
> ```python
> infixes = ("…", "-", "—", r"(?<=[0-9])[+\-\*^](?=[0-9-])")
> infix_regex = util.compile_infix_regex(infixes)
> nlp.tokenizer.infix_finditer = infix_regex.finditer
> ```
| Name | Type | Description |
| ----------- | ------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | tuple | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). |
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). |
### util.minibatch {#util.minibatch tag="function" new="2"} ### util.minibatch {#util.minibatch tag="function" new="2"}
Iterate over batches of items. `size` may be an iterator, so that batch-size can Iterate over batches of items. `size` may be an iterator, so that batch-size can

View File

@ -812,6 +812,40 @@ only be applied at the **end of a token**, so your expression should end with a
</Infobox> </Infobox>
#### Adding to existing rule sets {#native-tokenizer-additions}
In many situations, you don't necessarily need entirely custom rules. Sometimes
you just want to add another character to the prefixes, suffixes or infixes. The
default prefix, suffix and infix rules are available via the `nlp` object's
`Defaults` and the [`Tokenizer.suffix_search`](/api/tokenizer#attributes)
attribute is writable, so you can overwrite it with a compiled regular
expression object using of the modified default rules. spaCy ships with utility
functions to help you compile the regular expressions for example,
[`compile_suffix_regex`](/api/top-level#util.compile_suffix_regex):
```python
suffixes = nlp.Defaults.suffixes + (r'''-+$''',)
suffix_regex = spacy.util.compile_suffix_regex(suffixes)
nlp.tokenizer.suffix_search = suffix_regex.search
```
For an overview of the default regular expressions, see
[`lang/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/punctuation.py).
The `Tokenizer.suffix_search` attribute should be a function which takes a
unicode string and returns a **regex match object** or `None`. Usually we use
the `.search` attribute of a compiled regex object, but you can use some other
function that behaves the same way.
<Infobox title="Important note" variant="warning">
If you're using a statistical model, writing to the `nlp.Defaults` or
`English.Defaults` directly won't work, since the regular expressions are read
from the model and will be compiled when you load it. You'll only see the effect
if you call [`spacy.blank`](/api/top-level#spacy.blank) or
`Defaults.create_tokenizer()`.
</Infobox>
### Hooking an arbitrary tokenizer into the pipeline {#custom-tokenizer} ### Hooking an arbitrary tokenizer into the pipeline {#custom-tokenizer}
The tokenizer is the first component of the processing pipeline and the only one The tokenizer is the first component of the processing pipeline and the only one