Merge branch 'develop' into spacy.io

Ines Montani · 2019-02-24 18:35:32 +01:00
commit 948ca2bb3e
20 changed files with 450 additions and 244 deletions


@@ -54,9 +54,9 @@ valuable if it's shared publicly, so that more people can benefit from it.
| Type                      | Platforms                                              |
| ------------------------- | ------------------------------------------------------ |
-| 🚨**Bug Reports**         | [GitHub Issue Tracker]                                 |
+| 🚨 **Bug Reports**        | [GitHub Issue Tracker]                                 |
| 🎁 **Feature Requests**   | [GitHub Issue Tracker]                                 |
-| 👩‍💻**Usage Questions**    | [Stack Overflow] · [Gitter Chat] · [Reddit User Group] |
+| 👩‍💻 **Usage Questions**   | [Stack Overflow] · [Gitter Chat] · [Reddit User Group] |
| 🗯 **General Discussion** | [Gitter Chat] · [Reddit User Group]                    |

[github issue tracker]: https://github.com/explosion/spaCy/issues


@@ -4,7 +4,7 @@ import random
import srsly
import spacy
from spacy.gold import GoldParse
-from spacy.util import minibatch
+from spacy.util import minibatch, compounding

LABEL = "ANIMAL"

@@ -54,9 +54,17 @@ def main(model_name, unlabelled_loc):
    nlp.get_pipe("ner").add_label(LABEL)
    raw_docs = list(read_raw_data(nlp, unlabelled_loc))
    optimizer = nlp.resume_training()
+    # Avoid use of Adam when resuming training. I don't understand this well
+    # yet, but I'm getting weird results from Adam. Try commenting out the
+    # nlp.update(), and using Adam -- you'll find the models drift apart.
+    # I guess Adam is losing precision, introducing gradient noise?
+    optimizer.alpha = 0.1
+    optimizer.b1 = 0.0
+    optimizer.b2 = 0.0
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
+    sizes = compounding(1.0, 4.0, 1.001)
    with nlp.disable_pipes(*other_pipes):
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)

@@ -64,13 +72,22 @@ def main(model_name, unlabelled_loc):
            losses = {}
            r_losses = {}
            # batch up the examples using spaCy's minibatch
-            raw_batches = minibatch(raw_docs, size=batch_size)
+            raw_batches = minibatch(raw_docs, size=4)
-            for doc, gold in TRAIN_DATA:
-                nlp.update([doc], [gold], sgd=optimizer, drop=dropout, losses=losses)
+            for batch in minibatch(TRAIN_DATA, size=sizes):
+                docs, golds = zip(*batch)
+                nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
                raw_batch = list(next(raw_batches))
                nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
            print("Losses", losses)
            print("R. Losses", r_losses)
+    print(nlp.get_pipe('ner').model.unseen_classes)
+    test_text = "Do you like horses?"
+    doc = nlp(test_text)
+    print("Entities in '%s'" % test_text)
+    for ent in doc.ents:
+        print(ent.label_, ent.text)

if __name__ == "__main__":


@@ -45,19 +45,19 @@ LABEL = "ANIMAL"
TRAIN_DATA = [
    (
        "Horses are too tall and they pretend to care about your feelings",
-        {"entities": [(0, 6, "ANIMAL")]},
+        {"entities": [(0, 6, LABEL)]},
    ),
    ("Do they bite?", {"entities": []}),
    (
        "horses are too tall and they pretend to care about your feelings",
-        {"entities": [(0, 6, "ANIMAL")]},
+        {"entities": [(0, 6, LABEL)]},
    ),
-    ("horses pretend to care about your feelings", {"entities": [(0, 6, "ANIMAL")]}),
+    ("horses pretend to care about your feelings", {"entities": [(0, 6, LABEL)]}),
    (
        "they pretend to care about your feelings, those horses",
-        {"entities": [(48, 54, "ANIMAL")]},
+        {"entities": [(48, 54, LABEL)]},
    ),
-    ("horses?", {"entities": [(0, 6, "ANIMAL")]}),
+    ("horses?", {"entities": [(0, 6, LABEL)]}),
]

@@ -67,8 +67,9 @@ TRAIN_DATA = [
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
-def main(model=None, new_model_name="animal", output_dir=None, n_iter=10):
+def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
    """Set up the pipeline and entity recognizer, and train the new entity."""
+    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)

@@ -85,21 +86,22 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=10):
    ner = nlp.get_pipe("ner")
    ner.add_label(LABEL)  # add new entity label to entity recognizer
+    # Adding extraneous labels shouldn't mess anything up
+    ner.add_label('VEGETABLE')
    if model is None:
        optimizer = nlp.begin_training()
    else:
-        # Note that 'begin_training' initializes the models, so it'll zero out
-        # existing entity types.
-        optimizer = nlp.entity.create_optimizer()
+        optimizer = nlp.resume_training()
+    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
+        sizes = compounding(1.0, 4.0, 1.001)
+        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
+            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
-            # batch up the examples using spaCy's minibatch
-            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)

@@ -124,6 +126,8 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=10):
        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
+        # Check the classes have loaded back consistently
+        assert nlp2.get_pipe('ner').move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)


@@ -571,8 +571,6 @@ def build_text_classifier(nr_class, width=64, **cfg):
            zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
            >> logistic
        )
        model = (
            (linear_model | cnn_model)
            >> output_layer


@@ -4,7 +4,7 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "2.1.0a9.dev1"
+__version__ = "2.1.0a9.dev2"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io"
__author__ = "Explosion AI"


@@ -290,7 +290,8 @@ class Errors(object):
            "NBOR_RELOP.")
    E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
            "have been declared in previous edges.")
-    E102 = ("Can't merge non-disjoint spans. '{token}' is already part of tokens to merge")
+    E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
+            "tokens to merge.")
    E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
            " can only be part of one entity, so make sure the entities you're "
            "setting don't overlap.")

@@ -318,12 +319,12 @@ class Errors(object):
            "So instead of pickling the span, pickle the Doc it belongs to or "
            "use Span.as_doc to convert the span to a standalone Doc object.")
    E113 = ("The newly split token can only have one root (head = 0).")
-    E114 = ("The newly split token needs to have a root (head = 0)")
+    E114 = ("The newly split token needs to have a root (head = 0).")
-    E115 = ("All subtokens must have associated heads")
+    E115 = ("All subtokens must have associated heads.")
    E116 = ("Cannot currently add labels to pre-trained text classifier. Add "
            "labels before training begins. This functionality was available "
            "in previous versions, but had significant bugs that led to poor "
-            "performance")
+            "performance.")
    E117 = ("The newly split tokens must match the text of the original token. "
            "New orths: {new}. Old text: {old}.")


@@ -24,52 +24,68 @@ _latin_l_supplement = r"\u00DF-\u00F6\u00F8-\u00FF"
_latin_supplement = r"\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF"
# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
_latin_u_extendedA = (
    r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
    r"\u011E\u0120\u0122\u0124\u0126\u0128\u012A\u012C\u012E\u0130\u0132\u0134\u0136\u0139\u013B"
    r"\u013D\u013F\u0141\u0143\u0145\u0147\u014A\u014C\u014E\u0150\u0152\u0154\u0156\u0158"
    r"\u015A\u015C\u015E\u0160\u0162\u0164\u0166\u0168\u016A\u016C\u016E\u0170\u0172\u0174\u0176"
    r"\u0178\u0179\u017B\u017D"
)
_latin_l_extendedA = (
    r"\u0101\u0103\u0105\u0107\u0109\u010B\u010D\u010F\u0111\u0113\u0115\u0117\u0119\u011B\u011D"
    r"\u011F\u0121\u0123\u0125\u0127\u0129\u012B\u012D\u012F\u0131\u0133\u0135\u0137\u0138\u013A"
    r"\u013C\u013E\u0140\u0142\u0144\u0146\u0148\u0149\u014B\u014D\u014F\u0151\u0153\u0155\u0157"
    r"\u0159\u015B\u015D\u015F\u0161\u0163\u0165\u0167\u0169\u016B\u016D\u016F\u0171\u0173\u0175"
    r"\u0177\u017A\u017C\u017E\u017F"
)
_latin_extendedA = r"\u0100-\u017F"
# special characters - Khoisan, Pan-Nigerian, Pinyin, Romanian
# those that are a combination of both upper and lower letters are only included in the group _latin_extendedB
_latin_u_extendedB = (
    r"\u0181\u0182\u0184\u0186\u0187\u0189-\u018B\u018E-\u0191\u0193\u0194\u0196-\u0198\u019C"
    r"\u019D\u019F\u01A0\u01A2\u01A4\u01A6\u01A7\u01A9\u01AC\u01AE\u01AF\u01B1-\u01B3\u01B5"
    r"\u01B7\u01B8\u01BC\u01C4\u01C7\u01CA\u01CD\u01CF\u01D1\u01D3\u01D5\u01D7\u01D9\u01DB"
    r"\u01DE\u01E0\u01E2\u01E4\u01E6\u01E8\u01EA\u01EC\u01EE\u01F1\u01F4\u01F6-\u01F8\u01FA"
    r"\u01FC\u01FE\u0200\u0202\u0204\u0206\u0208\u020A\u020C\u020E\u0210\u0212\u0214\u0216"
    r"\u0218\u021A\u021C\u021E\u0220\u0222\u0224\u0226\u0228\u022A\u022C\u022E\u0230\u0232"
    r"\u023A\u023B\u023D\u023E\u0241\u0243-\u0246\u0248\u024A\u024C\u024E"
)
_latin_l_extendedB = (
    r"\u0180\u0183\u0185\u0188\u018C\u018D\u0192\u0195\u0199-\u019B\u019E\u01A1\u01A3\u01A5"
    r"\u01A8\u01AA\u01AB\u01AD\u01B0\u01B4\u01B6\u01B9\u01BA\u01BD-\u01BF\u01C6\u01C9\u01CC"
    r"\u01CE\u01D0\u01D2\u01D4\u01D6\u01D8\u01DA\u01DC\u01DD\u01DF\u01E1\u01E3\u01E5\u01E7"
    r"\u01E9\u01EB\u01ED\u01EF\u01F0\u01F3\u01F5\u01F9\u01FB\u01FD\u01FF\u0201\u0203\u0205"
    r"\u0207\u0209\u020B\u020D\u020F\u0211\u0213\u0215\u0217\u0219\u021B\u021D\u021F\u0221"
    r"\u0223\u0225\u0227\u0229\u022B\u022D\u022F\u0231\u0233-\u0239\u023C\u023F\u0240\u0242"
    r"\u0247\u0249\u024B\u024D\u024F"
)
_latin_extendedB = r"\u0180-\u01BF\u01C4-\u024F"
# special characters - Uighur, Uralic Phonetic
_latin_u_extendedC = (
    r"\u2C60\u2C62-\u2C64\u2C67\u2C69\u2C6B\u2C6D-\u2C70\u2C72\u2C75\u2C7E\u2C7F"
)
_latin_l_extendedC = (
    r"\u2C61\u2C65\u2C66\u2C68\u2C6A\u2C6C\u2C71\u2C73\u2C74\u2C76-\u2C7B"
)
_latin_extendedC = r"\u2C60-\u2C7B\u2C7E\u2C7F"
# special characters - phonetic, Mayan, Medieval
_latin_u_extendedD = (
    r"\uA722\uA724\uA726\uA728\uA72A\uA72C\uA72E\uA732\uA734\uA736\uA738\uA73A\uA73C"
    r"\uA73E\uA740\uA742\uA744\uA746\uA748\uA74A\uA74C\uA74E\uA750\uA752\uA754\uA756\uA758"
    r"\uA75A\uA75C\uA75E\uA760\uA762\uA764\uA766\uA768\uA76A\uA76C\uA76E\uA779\uA77B\uA77D"
    r"\uA77E\uA780\uA782\uA784\uA786\uA78B\uA78D\uA790\uA792\uA796\uA798\uA79A\uA79C\uA79E"
    r"\uA7A0\uA7A2\uA7A4\uA7A6\uA7A8\uA7AA-\uA7AE\uA7B0-\uA7B4\uA7B6\uA7B8"
)
_latin_l_extendedD = (
    r"\uA723\uA725\uA727\uA729\uA72B\uA72D\uA72F-\uA731\uA733\uA735\uA737\uA739\uA73B\uA73D"
    r"\uA73F\uA741\uA743\uA745\uA747\uA749\uA74B\uA74D\uA74F\uA751\uA753\uA755\uA757\uA759"
    r"\uA75B\uA75D\uA75F\uA761\uA763\uA765\uA767\uA769\uA76B\uA76D\uA76F\uA771-\uA778\uA77A"
    r"\uA77C\uA77F\uA781\uA783\uA785\uA787\uA78C\uA78E\uA791\uA793-\uA795\uA797\uA799\uA79B"
    r"\uA79D\uA79F\uA7A1\uA7A3\uA7A5\uA7A7\uA7A9\uA7AF\uA7B5\uA7B7\uA7B9\uA7FA"
)
_latin_extendedD = r"\uA722-\uA76F\uA771-\uA787\uA78B-\uA78E\uA790-\uA7B9\uA7FA"
# special characters - phonetic Teuthonista and Sakha
@@ -81,42 +97,80 @@ _latin_l_phonetic = r"\u0250-\u02AF\u1D00-\u1D25\u1D6B-\u1D77\u1D79-\u1D9A"
_latin_phonetic = _latin_l_phonetic
# letters with multiple diacritics - Vietnamese
_latin_u_diacritics = (
    r"\u1E00\u1E02\u1E04\u1E06\u1E08\u1E0A\u1E0C\u1E0E\u1E10\u1E12\u1E14\u1E16\u1E18\u1E1A"
    r"\u1E1C\u1E1E\u1E20\u1E22\u1E24\u1E26\u1E28\u1E2A\u1E2C\u1E2E\u1E30\u1E32\u1E34\u1E36"
    r"\u1E38\u1E3A\u1E3C\u1E3E\u1E40\u1E42\u1E44\u1E46\u1E48\u1E4A\u1E4C\u1E4E\u1E50\u1E52"
    r"\u1E54\u1E56\u1E58\u1E5A\u1E5C\u1E5E\u1E60\u1E62\u1E64\u1E66\u1E68\u1E6A\u1E6C\u1E6E"
    r"\u1E70\u1E72\u1E74\u1E76\u1E78\u1E7A\u1E7C\u1E7E\u1E80\u1E82\u1E84\u1E86\u1E88\u1E8A"
    r"\u1E8C\u1E8E\u1E90\u1E92\u1E94\u1E9E\u1EA0\u1EA2\u1EA4\u1EA6\u1EA8\u1EAA\u1EAC\u1EAE"
    r"\u1EB0\u1EB2\u1EB4\u1EB6\u1EB8\u1EBA\u1EBC\u1EBE\u1EC0\u1EC2\u1EC4\u1EC6\u1EC8"
    r"\u1ECA\u1ECC\u1ECE\u1ED0\u1ED2\u1ED4\u1ED6\u1ED8\u1EDA\u1EDC\u1EDE\u1EE0\u1EE2\u1EE4"
    r"\u1EE6\u1EE8\u1EEA\u1EEC\u1EEE\u1EF0\u1EF2\u1EF4\u1EF6\u1EF8\u1EFA\u1EFC\u1EFE"
)
_latin_l_diacritics = (
    r"\u1E01\u1E03\u1E05\u1E07\u1E09\u1E0B\u1E0D\u1E0F\u1E11\u1E13\u1E15\u1E17\u1E19\u1E1B"
    r"\u1E1D\u1E1F\u1E21\u1E23\u1E25\u1E27\u1E29\u1E2B\u1E2D\u1E2F\u1E31\u1E33\u1E35\u1E37"
    r"\u1E39\u1E3B\u1E3D\u1E3F\u1E41\u1E43\u1E45\u1E47\u1E49\u1E4B\u1E4D\u1E4F\u1E51\u1E53"
    r"\u1E55\u1E57\u1E59\u1E5B\u1E5D\u1E5F\u1E61\u1E63\u1E65\u1E67\u1E69\u1E6B\u1E6D\u1E6F"
    r"\u1E71\u1E73\u1E75\u1E77\u1E79\u1E7B\u1E7D\u1E7F\u1E81\u1E83\u1E85\u1E87\u1E89\u1E8B"
    r"\u1E8D\u1E8F\u1E91\u1E93\u1E95-\u1E9D\u1E9F\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD"
    r"\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7\u1EB9\u1EBB\u1EBD\u1EBF\u1EC1\u1EC3\u1EC5\u1EC7\u1EC9"
    r"\u1ECB\u1ECD\u1ECF\u1ED1\u1ED3\u1ED5\u1ED7\u1ED9\u1EDB\u1EDD\u1EDF\u1EE1\u1EE3\u1EE5"
    r"\u1EE7\u1EE9\u1EEB\u1EED\u1EEF\u1EF1\u1EF3\u1EF5\u1EF7\u1EF9\u1EFB\u1EFD\u1EFF"
)
_latin_diacritics = r"\u1E00-\u1EFF"
# all lower latin classes
LATIN_LOWER_BASIC = (
    _latin_l_standard
    + _latin_l_standard_fullwidth
    + _latin_l_supplement
    + _latin_l_extendedA
)
LATIN_LOWER = (
    LATIN_LOWER_BASIC
    + _latin_l_extendedB
    + _latin_l_extendedC
    + _latin_l_extendedD
    + _latin_l_extendedE
    + _latin_l_phonetic
    + _latin_l_diacritics
)
# all upper latin classes
LATIN_UPPER_BASIC = (
    _latin_u_standard
    + _latin_u_standard_fullwidth
    + _latin_u_supplement
    + _latin_u_extendedA
)
LATIN_UPPER = (
    LATIN_UPPER_BASIC
    + _latin_u_extendedB
    + _latin_u_extendedC
    + _latin_u_extendedD
    + _latin_u_diacritics
)
# all latin classes
LATIN_BASIC = (
    _latin_standard + _latin_standard_fullwidth + _latin_supplement + _latin_extendedA
)
LATIN = (
    LATIN_BASIC
    + _latin_extendedB
    + _latin_extendedC
    + _latin_extendedD
    + _latin_extendedE
    + _latin_phonetic
    + _latin_diacritics
)
_persian = (
    r"\u0620-\u064A\u066E-\u06D5\u06E5-\u06FF\u0750-\u077F\u08A0-\u08BD"
    r"\uFB50-\uFBB1\uFBD3-\uFD3D\uFD50-\uFDC7\uFDF0-\uFDFB\uFE70-\uFEFC\U0001EE00-\U0001EEBB"
)
_russian_lower = r"ёа-я"
_russian_upper = r"ЁА-Я"
@@ -165,33 +219,35 @@ _hyphens = "- — -- --- —— ~"
# Various symbols like dingbats, but also emoji
# Details: https://www.compart.com/en/unicode/category/So
_other_symbols = (
    r"\u00A6\u00A9\u00AE\u00B0\u0482\u058D\u058E\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u09FA\u0B70"
    r"\u0BF3-\u0BF8\u0BFA\u0C7F\u0D4F\u0D79\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34"
    r"\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399"
    r"\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116"
    r"\u2117\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u214A\u214C\u214D\u214F\u218A\u218B"
    r"\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D3"
    r"\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u23B4-\u23DB"
    r"\u23E2-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u266E"
    r"\u2670-\u2767\u2794-\u27BF\u2800-\u28FF\u2B00-\u2B2F\u2B45\u2B46\u2B4D-\u2B73\u2B76-\u2B95"
    r"\u2B98-\u2BC8\u2BCA-\u2BFE\u2CE5-\u2CEA\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB"
    r"\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u31C0-\u31E3"
    r"\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u32FE\u3300-\u33FF\u4DC0-\u4DFF"
    r"\uA490-\uA4C6\uA828-\uA82B\uA836\uA837\uA839\uAA77-\uAA79\uFDFD\uFFE4\uFFE8\uFFED\uFFEE\uFFFC"
    r"\uFFFD\U00010137-\U0001013F\U00010179-\U00010189\U0001018C-\U0001018E\U00010190-\U0001019B"
    r"\U000101A0\U000101D0-\U000101FC\U00010877\U00010878\U00010AC8\U0001173F\U00016B3C-\U00016B3F"
    r"\U00016B45\U0001BC9C\U0001D000-\U0001D0F5\U0001D100-\U0001D126\U0001D129-\U0001D164"
    r"\U0001D16A-\U0001D16C\U0001D183\U0001D184\U0001D18C-\U0001D1A9\U0001D1AE-\U0001D1E8"
    r"\U0001D200-\U0001D241\U0001D245\U0001D300-\U0001D356\U0001D800-\U0001D9FF\U0001DA37-\U0001DA3A"
    r"\U0001DA6D-\U0001DA74\U0001DA76-\U0001DA83\U0001DA85\U0001DA86\U0001ECAC\U0001F000-\U0001F02B"
    r"\U0001F030-\U0001F093\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF"
    r"\U0001F0D1-\U0001F0F5\U0001F110-\U0001F16B\U0001F170-\U0001F1AC\U0001F1E6-\U0001F202"
    r"\U0001F210-\U0001F23B\U0001F240-\U0001F248\U0001F250\U0001F251\U0001F260-\U0001F265"
    r"\U0001F300-\U0001F3FA\U0001F400-\U0001F6D4\U0001F6E0-\U0001F6EC\U0001F6F0-\U0001F6F9"
    r"\U0001F700-\U0001F773\U0001F780-\U0001F7D8\U0001F800-\U0001F80B\U0001F810-\U0001F847"
    r"\U0001F850-\U0001F859\U0001F860-\U0001F887\U0001F890-\U0001F8AD\U0001F900-\U0001F90B"
    r"\U0001F910-\U0001F93E\U0001F940-\U0001F970\U0001F973-\U0001F976\U0001F97A\U0001F97C-\U0001F9A2"
    r"\U0001F9B0-\U0001F9B9\U0001F9C0-\U0001F9C2\U0001F9D0-\U0001F9FF\U0001FA60-\U0001FA6D"
)
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)


@@ -19,6 +19,7 @@ cdef struct WeightsC:
    const float* feat_bias
    const float* hidden_bias
    const float* hidden_weights
+    const float* seen_classes

cdef struct ActivationsC:


@@ -44,8 +44,10 @@ cdef WeightsC get_c_weights(model) except *:
    output.feat_bias = <const float*>state2vec.bias.data
    cdef np.ndarray vec2scores_W = model.vec2scores.W
    cdef np.ndarray vec2scores_b = model.vec2scores.b
+    cdef np.ndarray class_mask = model._class_mask
    output.hidden_weights = <const float*>vec2scores_W.data
    output.hidden_bias = <const float*>vec2scores_b.data
+    output.seen_classes = <const float*>class_mask.data
    return output

@@ -115,6 +117,16 @@ cdef void predict_states(ActivationsC* A, StateC** states,
    for i in range(n.states):
        VecVec.add_i(&A.scores[i*n.classes],
            W.hidden_bias, 1., n.classes)
+    # Set unseen classes to minimum value
+    i = 0
+    min_ = A.scores[0]
+    for i in range(1, n.states * n.classes):
+        if A.scores[i] < min_:
+            min_ = A.scores[i]
+    for i in range(n.states):
+        for j in range(n.classes):
+            if not W.seen_classes[j]:
+                A.scores[i*n.classes+j] = min_

cdef void sum_state_features(float* output,

@@ -189,12 +201,17 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
class ParserModel(Model):
-    def __init__(self, tok2vec, lower_model, upper_model):
+    def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
        Model.__init__(self)
        self._layers = [tok2vec, lower_model, upper_model]
+        self.unseen_classes = set()
+        if unseen_classes:
+            for class_ in unseen_classes:
+                self.unseen_classes.add(class_)

    def begin_update(self, docs, drop=0.):
-        step_model = ParserStepModel(docs, self._layers, drop=drop)
+        step_model = ParserStepModel(docs, self._layers, drop=drop,
+                                     unseen_classes=self.unseen_classes)
        def finish_parser_update(golds, sgd=None):
            step_model.make_updates(sgd)
            return None

@@ -207,9 +224,8 @@ class ParserModel(Model):
        with Model.use_device('cpu'):
            larger = Affine(new_output, smaller.nI)
-        # Set nan as value for unseen classes, to prevent prediction.
-        larger.W.fill(self.ops.xp.nan)
-        larger.b.fill(self.ops.xp.nan)
+        larger.W.fill(0.0)
+        larger.b.fill(0.0)
        # It seems very unhappy if I pass these as smaller.W?
        # Seems to segfault. Maybe it's a descriptor protocol thing?
        smaller_W = smaller.W

@@ -221,6 +237,8 @@ class ParserModel(Model):
        larger_W[:smaller.nO] = smaller_W
        larger_b[:smaller.nO] = smaller_b
        self._layers[-1] = larger
+        for i in range(smaller.nO, new_output):
+            self.unseen_classes.add(i)

    def begin_training(self, X, y=None):
        self.lower.begin_training(X, y=y)

@@ -239,18 +257,32 @@ class ParserModel(Model):
class ParserStepModel(Model):
-    def __init__(self, docs, layers, drop=0.):
+    def __init__(self, docs, layers, unseen_classes=None, drop=0.):
        self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop)
        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
                                            drop=drop)
        self.vec2scores = layers[-1]
        self.cuda_stream = util.get_cuda_stream()
        self.backprops = []
+        self._class_mask = numpy.zeros((self.vec2scores.nO,), dtype='f')
+        self._class_mask.fill(1)
+        if unseen_classes is not None:
+            for class_ in unseen_classes:
+                self._class_mask[class_] = 0.

    @property
    def nO(self):
        return self.state2vec.nO

+    def class_is_unseen(self, class_):
+        return self._class_mask[class_]
+
+    def mark_class_unseen(self, class_):
+        self._class_mask[class_] = 0
+
+    def mark_class_seen(self, class_):
+        self._class_mask[class_] = 1
+
    def begin_update(self, states, drop=0.):
        token_ids = self.get_token_ids(states)
        vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)

@@ -258,24 +290,12 @@ class ParserStepModel(Model):
        if mask is not None:
            vector *= mask
        scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
-        # We can have nans from unseen classes.
-        # For backprop purposes, we want to treat unseen classes as having the
-        # lowest score.
-        # numpy's nan_to_num function doesn't take a value, and nan is replaced
-        # by 0...-inf is replaced by minimum, so we go via that. Ugly to the max.
-        # Note that scores is always a numpy array! Should fix #3112
-        scores[numpy.isnan(scores)] = -numpy.inf
-        numpy.nan_to_num(scores, copy=False)
+        # If the class is unseen, make sure its score is minimum
+        scores[:, self._class_mask == 0] = numpy.nanmin(scores)

        def backprop_parser_step(d_scores, sgd=None):
-            # If we have a non-zero gradient for a previously unseen class,
-            # replace the weight with 0.
-            new_classes = self.vec2scores.ops.xp.logical_and(
-                self.vec2scores.ops.xp.isnan(self.vec2scores.b),
-                d_scores.any(axis=0)
-            )
-            self.vec2scores.b[new_classes] = 0.
-            self.vec2scores.W[new_classes] = 0.
+            # Zero vectors for unseen classes
+            d_scores *= self._class_mask
            d_vector = get_d_vector(d_scores, sgd=sgd)
            if mask is not None:
                d_vector *= mask
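The class-mask logic above replaces the earlier NaN-based handling: scores for classes the model has never been updated on are clamped to the minimum so they cannot win the argmax, and their gradients are zeroed. A standalone NumPy sketch of that idea (illustrative only, not spaCy's internal classes):

```python
import numpy

n_states, n_classes = 2, 5
scores = numpy.random.uniform(-1, 1, (n_states, n_classes)).astype("f")
d_scores = numpy.random.uniform(-1, 1, (n_states, n_classes)).astype("f")

class_mask = numpy.ones((n_classes,), dtype="f")
class_mask[[3, 4]] = 0.0  # pretend classes 3 and 4 were never seen in training

# Forward pass: unseen classes can never win the argmax
scores[:, class_mask == 0] = scores.min()
# Backward pass: no gradient flows back for the unseen classes
d_scores *= class_mask
print(scores.argmax(axis=1), numpy.abs(d_scores[:, 3:]).sum())
```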


@@ -163,6 +163,8 @@ cdef class Parser:
            added = self.moves.add_action(action, label)
            if added:
                resized = True
+        if resized and "nr_class" in self.cfg:
+            self.cfg["nr_class"] = self.moves.n_moves
        if self.model not in (True, False, None) and resized:
            self.model.resize_output(self.moves.n_moves)

@@ -435,22 +437,22 @@ cdef class Parser:
        if self._rehearsal_model is None:
            return None
        losses.setdefault(self.name, 0.)
        states = self.moves.init_batch(docs)
        # This is pretty dirty, but the NER can resize itself in init_batch,
        # if labels are missing. We therefore have to check whether we need to
        # expand our model output.
        self.model.resize_output(self.moves.n_moves)
+        self._rehearsal_model.resize_output(self.moves.n_moves)
        # Prepare the stepwise model, and get the callback for finishing the batch
-        tutor = self._rehearsal_model(docs)
+        tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0)
        model, finish_update = self.model.begin_update(docs, drop=0.0)
        n_scores = 0.
        loss = 0.
-        non_zeroed_classes = self._rehearsal_model.upper.W.any(axis=1)
        while states:
-            targets, _ = tutor.begin_update(states)
+            targets, _ = tutor.begin_update(states, drop=0.)
-            guesses, backprop = model.begin_update(states)
+            guesses, backprop = model.begin_update(states, drop=0.)
-            d_scores = (targets - guesses) / targets.shape[0]
+            d_scores = (guesses - targets) / targets.shape[0]
-            d_scores *= non_zeroed_classes
            # If all weights for an output are 0 in the original model, don't
            # supervise that output. This allows us to add classes.
            loss += (d_scores**2).sum()

@@ -543,6 +545,9 @@ cdef class Parser:
            memset(is_valid, 0, self.moves.n_moves * sizeof(int))
            memset(costs, 0, self.moves.n_moves * sizeof(float))
            self.moves.set_costs(is_valid, costs, state, gold)
+            for j in range(self.moves.n_moves):
+                if costs[j] <= 0.0 and j in self.model.unseen_classes:
+                    self.model.unseen_classes.remove(j)
            cpu_log_loss(c_d_scores,
                costs, is_valid, &scores[i, 0], d_scores.shape[1])
            c_d_scores += d_scores.shape[1]
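For context, the `rehearse` update above is a squared-error penalty that pulls the current model's scores towards those of the frozen initial model on raw, unlabelled text, which is what guards against catastrophic forgetting. A minimal NumPy sketch of the loss and gradient in the same form as the diff (shapes made up for illustration):

```python
import numpy

n_states, n_classes = 8, 5
targets = numpy.random.uniform(-1, 1, (n_states, n_classes))  # frozen "tutor" model scores
guesses = numpy.random.uniform(-1, 1, (n_states, n_classes))  # current model scores

# Gradient of a squared-error penalty between the two sets of scores
d_scores = (guesses - targets) / targets.shape[0]
loss = (d_scores ** 2).sum()
print(loss)
```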


@@ -147,6 +147,8 @@ cdef class TransitionSystem:
    def initialize_actions(self, labels_by_action, min_freq=None):
        self.labels = {}
        self.n_moves = 0
+        added_labels = []
+        added_actions = {}
        for action, label_freqs in sorted(labels_by_action.items()):
            action = int(action)
            # Make sure we take a copy here, and that we get a Counter

@@ -157,6 +159,15 @@ cdef class TransitionSystem:
            sorted_labels.sort()
            sorted_labels.reverse()
            for freq, label_str in sorted_labels:
+                if freq < 0:
+                    added_labels.append((freq, label_str))
+                    added_actions.setdefault(label_str, []).append(action)
+                else:
+                    self.add_action(int(action), label_str)
+                    self.labels[action][label_str] = freq
+        added_labels.sort(reverse=True)
+        for freq, label_str in added_labels:
+            for action in added_actions[label_str]:
                self.add_action(int(action), label_str)
                self.labels[action][label_str] = freq


@@ -6,7 +6,6 @@ import pytest
import numpy
from spacy.tokens import Doc
from spacy.vocab import Vocab
-from spacy.attrs import LEMMA
from spacy.errors import ModelsWarning

from ..util import get_doc

@@ -139,81 +138,6 @@ def test_doc_api_set_ents(en_tokenizer):
    assert tokens.ents[0].end == 4

-def test_doc_api_merge(en_tokenizer):
-    text = "WKRO played songs by the beach boys all night"
-    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
-    # merge both with bulk merge
-    doc = en_tokenizer(text)
-    assert len(doc) == 9
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[4:7], attrs=attrs)
-        retokenizer.merge(doc[7:9], attrs=attrs)
-    assert len(doc) == 6
-    assert doc[4].text == "the beach boys"
-    assert doc[4].text_with_ws == "the beach boys "
-    assert doc[4].tag_ == "NAMED"
-    assert doc[5].text == "all night"
-    assert doc[5].text_with_ws == "all night"
-    assert doc[5].tag_ == "NAMED"
-
-def test_doc_api_merge_children(en_tokenizer):
-    """Test that attachments work correctly after merging."""
-    text = "WKRO played songs by the beach boys all night"
-    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
-    doc = en_tokenizer(text)
-    assert len(doc) == 9
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[4:7], attrs=attrs)
-    for word in doc:
-        if word.i < word.head.i:
-            assert word in list(word.head.lefts)
-        elif word.i > word.head.i:
-            assert word in list(word.head.rights)
-
-def test_doc_api_merge_hang(en_tokenizer):
-    text = "through North and South Carolina"
-    doc = en_tokenizer(text)
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"})
-        retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"})
-
-def test_doc_api_retokenizer(en_tokenizer):
-    doc = en_tokenizer("WKRO played songs by the beach boys all night")
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[4:7])
-    assert len(doc) == 7
-    assert doc[4].text == "the beach boys"
-
-def test_doc_api_retokenizer_attrs(en_tokenizer):
-    doc = en_tokenizer("WKRO played songs by the beach boys all night")
-    # test both string and integer attributes and values
-    attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]}
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[4:7], attrs=attrs)
-    assert len(doc) == 7
-    assert doc[4].text == "the beach boys"
-    assert doc[4].lemma_ == "boys"
-    assert doc[4].ent_type_ == "ORG"
-
-@pytest.mark.xfail
-def test_doc_api_retokenizer_lex_attrs(en_tokenizer):
-    """Test that lexical attributes can be changed (see #2390)."""
-    doc = en_tokenizer("WKRO played beach boys songs")
-    assert not any(token.is_stop for token in doc)
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True})
-    assert doc[2].text == "beach boys"
-    assert doc[2].lemma_ == "boys"
-    assert doc[2].is_stop
-    new_doc = Doc(doc.vocab, words=["beach boys"])
-    assert new_doc[0].is_stop

def test_doc_api_sents_empty_string(en_tokenizer):
    doc = en_tokenizer("")
    doc.is_parsed = True


@@ -1,14 +1,89 @@
# coding: utf-8
from __future__ import unicode_literals

+import pytest
+from spacy.attrs import LEMMA
from spacy.vocab import Vocab
from spacy.tokens import Doc
-import pytest

from ..util import get_doc

-def test_spans_merge_tokens(en_tokenizer):
+def test_doc_retokenize_merge(en_tokenizer):
+    text = "WKRO played songs by the beach boys all night"
+    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
+    doc = en_tokenizer(text)
+    assert len(doc) == 9
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7], attrs=attrs)
+        retokenizer.merge(doc[7:9], attrs=attrs)
+    assert len(doc) == 6
+    assert doc[4].text == "the beach boys"
+    assert doc[4].text_with_ws == "the beach boys "
+    assert doc[4].tag_ == "NAMED"
+    assert doc[5].text == "all night"
+    assert doc[5].text_with_ws == "all night"
+    assert doc[5].tag_ == "NAMED"
+
+def test_doc_retokenize_merge_children(en_tokenizer):
+    """Test that attachments work correctly after merging."""
+    text = "WKRO played songs by the beach boys all night"
+    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
+    doc = en_tokenizer(text)
+    assert len(doc) == 9
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7], attrs=attrs)
+    for word in doc:
+        if word.i < word.head.i:
+            assert word in list(word.head.lefts)
+        elif word.i > word.head.i:
+            assert word in list(word.head.rights)
+
+def test_doc_retokenize_merge_hang(en_tokenizer):
+    text = "through North and South Carolina"
+    doc = en_tokenizer(text)
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"})
+        retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"})
+
+def test_doc_retokenize_retokenizer(en_tokenizer):
+    doc = en_tokenizer("WKRO played songs by the beach boys all night")
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7])
+    assert len(doc) == 7
+    assert doc[4].text == "the beach boys"
+
+def test_doc_retokenize_retokenizer_attrs(en_tokenizer):
+    doc = en_tokenizer("WKRO played songs by the beach boys all night")
+    # test both string and integer attributes and values
+    attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]}
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7], attrs=attrs)
+    assert len(doc) == 7
+    assert doc[4].text == "the beach boys"
+    assert doc[4].lemma_ == "boys"
+    assert doc[4].ent_type_ == "ORG"
+
+@pytest.mark.xfail
+def test_doc_retokenize_lex_attrs(en_tokenizer):
+    """Test that lexical attributes can be changed (see #2390)."""
+    doc = en_tokenizer("WKRO played beach boys songs")
+    assert not any(token.is_stop for token in doc)
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True})
+    assert doc[2].text == "beach boys"
+    assert doc[2].lemma_ == "boys"
+    assert doc[2].is_stop
+    new_doc = Doc(doc.vocab, words=["beach boys"])
+    assert new_doc[0].is_stop
+
+def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
    text = "Los Angeles start."
    heads = [1, 1, 0, -1]
    tokens = en_tokenizer(text)

@@ -25,7 +100,7 @@ def test_spans_merge_tokens(en_tokenizer):
    assert doc[0].ent_type_ == "GPE"

-def test_spans_merge_heads(en_tokenizer):
+def test_doc_retokenize_spans_merge_heads(en_tokenizer):
    text = "I found a pilates class near work."
    heads = [1, 0, 2, 1, -3, -1, -1, -6]
    tokens = en_tokenizer(text)

@@ -43,7 +118,7 @@ def test_spans_merge_heads(en_tokenizer):
    assert doc[5].head.i == 4

-def test_spans_merge_non_disjoint(en_tokenizer):
+def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer):
    text = "Los Angeles start."
    doc = en_tokenizer(text)
    with pytest.raises(ValueError):

@@ -58,7 +133,7 @@ def test_spans_merge_non_disjoint(en_tokenizer):
    )

-def test_span_np_merges(en_tokenizer):
+def test_doc_retokenize_span_np_merges(en_tokenizer):
    text = "displaCy is a parse tool built with Javascript"
    heads = [1, 0, 2, 1, -3, -1, -1, -1]
    tokens = en_tokenizer(text)

@@ -87,7 +162,7 @@ def test_span_np_merges(en_tokenizer):
        retokenizer.merge(ent)

-def test_spans_entity_merge(en_tokenizer):
+def test_doc_retokenize_spans_entity_merge(en_tokenizer):
    # fmt: off
    text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
    heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1]

@@ -108,7 +183,7 @@ def test_spans_entity_merge(en_tokenizer):
    assert len(doc) == 15

-def test_spans_entity_merge_iob():
+def test_doc_retokenize_spans_entity_merge_iob():
    # Test entity IOB stays consistent after merging
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)

@@ -147,7 +222,7 @@ def test_spans_entity_merge_iob():
    assert doc[4].ent_iob_ == "I"

-def test_spans_sentence_update_after_merge(en_tokenizer):
+def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
    # fmt: off
    text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
    heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]

@@ -155,7 +230,6 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
            'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
            'compound', 'dobj', 'punct']
    # fmt: on
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    sent1, sent2 = list(doc.sents)

@@ -169,7 +243,7 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
    assert len(sent2) == init_len2 - 1

-def test_spans_subtree_size_check(en_tokenizer):
+def test_doc_retokenize_spans_subtree_size_check(en_tokenizer):
    # fmt: off
    text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
    heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2]

@@ -177,7 +251,6 @@ def test_spans_subtree_size_check(en_tokenizer):
            "nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound",
            "dobj"]
    # fmt: on
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    sent1 = list(doc.sents)[0]


@@ -8,7 +8,7 @@ from spacy.tokens import Doc
from ..util import get_doc

-def test_doc_split(en_vocab):
+def test_doc_retokenize_split(en_vocab):
    words = ["LosAngeles", "start", "."]
    heads = [1, 1, 0]
    doc = get_doc(en_vocab, words=words, heads=heads)

@@ -41,7 +41,7 @@ def test_doc_split(en_vocab):
    assert len(str(doc)) == 19

-def test_split_dependencies(en_vocab):
+def test_doc_retokenize_split_dependencies(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    dep1 = doc.vocab.strings.add("amod")
    dep2 = doc.vocab.strings.add("subject")

@@ -56,7 +56,7 @@ def test_split_dependencies(en_vocab):
    assert doc[1].dep == dep2

-def test_split_heads_error(en_vocab):
+def test_doc_retokenize_split_heads_error(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    # Not enough heads
    with pytest.raises(ValueError):

@@ -69,7 +69,7 @@ def test_split_heads_error(en_vocab):
        retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]])

-def test_spans_entity_merge_iob():
+def test_doc_retokenize_spans_entity_split_iob():
    # Test entity IOB stays consistent after merging
    words = ["abc", "d", "e"]
    doc = Doc(Vocab(), words=words)

@@ -84,7 +84,7 @@ def test_spans_entity_merge_iob():
    assert doc[3].ent_iob_ == "I"

-def test_spans_sentence_update_after_merge(en_vocab):
+def test_doc_retokenize_spans_sentence_update_after_split(en_vocab):
    # fmt: off
    words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He",
             "lives", "in", "England", "and", "loves", "JoePasquale", "."]

@@ -114,7 +114,7 @@ def test_spans_sentence_update_after_merge(en_vocab):
    assert len(sent2) == init_len2 + 1

-def test_split_orths_mismatch(en_vocab):
+def test_doc_retokenize_split_orths_mismatch(en_vocab):
    """Test that the regular retokenizer.split raises an error if the orths
    don't match the original token text. There might still be a method that
    allows this, but for the default use cases, merging and splitting should


@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals

-import pytest
from spacy.matcher import Matcher
from spacy.tokens import Token, Doc

@@ -28,7 +27,7 @@ def test_issue1971(en_vocab):
def test_issue_1971_2(en_vocab):
    matcher = Matcher(en_vocab)
    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
-    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] #{"IN": ["EUR"]}}]
+    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, pattern1, pattern2)
    matches = matcher(doc)

@@ -59,6 +58,5 @@ def test_issue_1971_4(en_vocab):
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
-    # Interesting: uncommenting this causes a segmentation fault, so there's
-    # definitely something going on here
-    # assert len(matches) == 1
+    # Uncommenting this caused a segmentation fault
+    assert len(matches) == 1


@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals

-import pytest
import numpy

from spacy import displacy


@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals

-import pytest
from spacy.lang.en import English


@@ -315,6 +315,11 @@ def read_regex(path):
def compile_prefix_regex(entries):
+    """Compile a list of prefix rules into a regex object.
+
+    entries (tuple): The prefix rules, e.g. spacy.lang.punctuation.TOKENIZER_PREFIXES.
+    RETURNS (regex object): The regex object to be used for Tokenizer.prefix_search.
+    """
    if "(" in entries:
        # Handle deprecated data
        expression = "|".join(

@@ -327,11 +332,21 @@ def compile_prefix_regex(entries):
def compile_suffix_regex(entries):
+    """Compile a list of suffix rules into a regex object.
+
+    entries (tuple): The suffix rules, e.g. spacy.lang.punctuation.TOKENIZER_SUFFIXES.
+    RETURNS (regex object): The regex object to be used for Tokenizer.suffix_search.
+    """
    expression = "|".join([piece + "$" for piece in entries if piece.strip()])
    return re.compile(expression)

def compile_infix_regex(entries):
+    """Compile a list of infix rules into a regex object.
+
+    entries (tuple): The infix rules, e.g. spacy.lang.punctuation.TOKENIZER_INFIXES.
+    RETURNS (regex object): The regex object to be used for Tokenizer.infix_finditer.
+    """
    expression = "|".join([piece for piece in entries if piece.strip()])
    return re.compile(expression)


@@ -504,6 +504,57 @@ an error if key doesn't match `ORTH` values.
| `*addition_dicts` | dicts | Exception dictionaries to add to the base exceptions, in order. |
| **RETURNS**       | dict  | Combined tokenizer exceptions.                                   |

+### util.compile_prefix_regex {#util.compile_prefix_regex tag="function"}
+
+Compile a sequence of prefix rules into a regex object.
+
+> #### Example
+>
+> ```python
+> prefixes = ("§", "%", "=", r"\+")
+> prefix_regex = util.compile_prefix_regex(prefixes)
+> nlp.tokenizer.prefix_search = prefix_regex.search
+> ```
+
+| Name        | Type                                                          | Description                                                                                                                               |
+| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
+| `entries`   | tuple                                                         | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). |
+| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes).                                                   |
+
+### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
+
+Compile a sequence of suffix rules into a regex object.
+
+> #### Example
+>
+> ```python
+> suffixes = ("'s", "'S", r"(?<=[0-9])\+")
+> suffix_regex = util.compile_suffix_regex(suffixes)
+> nlp.tokenizer.suffix_search = suffix_regex.search
+> ```
+
+| Name        | Type                                                          | Description                                                                                                                               |
+| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
+| `entries`   | tuple                                                         | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). |
+| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes).                                                   |
+
+### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
+
+Compile a sequence of infix rules into a regex object.
+
+> #### Example
+>
+> ```python
+> infixes = ("…", "-", "—", r"(?<=[0-9])[+\-\*^](?=[0-9-])")
+> infix_regex = util.compile_infix_regex(infixes)
+> nlp.tokenizer.infix_finditer = infix_regex.finditer
+> ```
+
+| Name        | Type                                                          | Description                                                                                                                             |
+| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
+| `entries`   | tuple                                                         | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). |
+| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes).                                                |
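Taken together, the three helpers can also feed a fully custom tokenizer. A minimal sketch, assuming the English defaults and the v2.x `Tokenizer` constructor:

```python
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex

nlp = spacy.blank("en")
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = compile_infix_regex(nlp.Defaults.infixes)
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    rules=nlp.Defaults.tokenizer_exceptions,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)
print([t.text for t in nlp("Let's tokenize: (this)!")])
```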
### util.minibatch {#util.minibatch tag="function" new="2"}

Iterate over batches of items. `size` may be an iterator, so that batch-size can
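The training examples touched by this commit pair `minibatch` with `compounding` so the batch size grows as training proceeds. A short, self-contained sketch of that pattern:

```python
from spacy.util import compounding, minibatch

train_data = ["item %d" % i for i in range(10)]
# Batch size compounds from 1.0 towards 4.0, as in sizes = compounding(1.0, 4.0, 1.001) above
sizes = compounding(1.0, 4.0, 1.001)
for batch in minibatch(train_data, size=sizes):
    print(len(batch), batch)
```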


@@ -812,6 +812,40 @@ only be applied at the **end of a token**, so your expression should end with a
</Infobox>

+#### Adding to existing rule sets {#native-tokenizer-additions}
+
+In many situations, you don't necessarily need entirely custom rules. Sometimes
+you just want to add another character to the prefixes, suffixes or infixes. The
+default prefix, suffix and infix rules are available via the `nlp` object's
+`Defaults` and the [`Tokenizer.suffix_search`](/api/tokenizer#attributes)
+attribute is writable, so you can overwrite it with a compiled regular
+expression object built from the modified default rules. spaCy ships with
+utility functions to help you compile the regular expressions, for example
+[`compile_suffix_regex`](/api/top-level#util.compile_suffix_regex):
+
+```python
+suffixes = nlp.Defaults.suffixes + (r'''-+$''',)
+suffix_regex = spacy.util.compile_suffix_regex(suffixes)
+nlp.tokenizer.suffix_search = suffix_regex.search
+```
+
+For an overview of the default regular expressions, see
+[`lang/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/punctuation.py).
+
+The `Tokenizer.suffix_search` attribute should be a function which takes a
+unicode string and returns a **regex match object** or `None`. Usually we use
+the `.search` attribute of a compiled regex object, but you can use some other
+function that behaves the same way.
+
+<Infobox title="Important note" variant="warning">
+
+If you're using a statistical model, writing to the `nlp.Defaults` or
+`English.Defaults` directly won't work, since the regular expressions are read
+from the model and will be compiled when you load it. You'll only see the effect
+if you call [`spacy.blank`](/api/top-level#spacy.blank) or
+`Defaults.create_tokenizer()`.
+
+</Infobox>
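To illustrate the point above about `Tokenizer.suffix_search` accepting any function that returns a match object or `None`, here is a small sketch; the wrapper function is hypothetical and simply mirrors the `-+$` rule from the documented example:

```python
import re

import spacy

nlp = spacy.blank("en")
default_suffix_search = nlp.tokenizer.suffix_search
trailing_hyphens = re.compile(r"-+$")


def custom_suffix_search(text):
    # Must behave like a compiled regex's .search: return a match object or None
    return trailing_hyphens.search(text) or default_suffix_search(text)


nlp.tokenizer.suffix_search = custom_suffix_search
```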
### Hooking an arbitrary tokenizer into the pipeline {#custom-tokenizer}

The tokenizer is the first component of the processing pipeline and the only one