Merge branch 'master' into spacy.io

2025-12-06 09:44:21 +03:00 · 2019-09-04 17:11:57 +02:00 · 2019-09-04 17:11:57 +02:00 · efd1c9d9f5
commit efd1c9d9f5
parent e055977851 2245e95e2d
27 changed files with 2072 additions and 53 deletions
--- a/.github/contributors/Schibsted.png
+++ b/.github/contributors/Schibsted.png
--- a/examples/keras_parikh_entailment/spacy_hook.py
+++ b/examples/keras_parikh_entailment/spacy_hook.py
@ -12,15 +12,15 @@ class KerasSimilarityShim(object):
    @classmethod
    def load(cls, path, nlp, max_length=100, get_features=None):
-        
+
        if get_features is None:
            get_features = get_word_ids
-            
+
-        with (path / 'config.json').open() as file_:
+        with (path / "config.json").open() as file_:
            model = model_from_json(file_.read())
-        with (path / 'model').open('rb') as file_:
+        with (path / "model").open("rb") as file_:
            weights = pickle.load(file_)
-            
+
        embeddings = get_embeddings(nlp.vocab)
        weights.insert(1, embeddings)
        model.set_weights(weights)
@ -33,8 +33,8 @@ class KerasSimilarityShim(object):
        self.max_length = max_length
    def __call__(self, doc):
-        doc.user_hooks['similarity'] = self.predict
+        doc.user_hooks["similarity"] = self.predict
-        doc.user_span_hooks['similarity'] = self.predict
+        doc.user_span_hooks["similarity"] = self.predict
        return doc
@ -48,24 +48,24 @@ class KerasSimilarityShim(object):
 def get_embeddings(vocab, nr_unk=100):
    # the extra +1 is for a zero vector representing sentence-final padding
-    num_vectors = max(lex.rank for lex in vocab) + 2 
+    num_vectors = max(lex.rank for lex in vocab) + 2
-    
+
    # create random vectors for OOV tokens
    oov = np.random.normal(size=(nr_unk, vocab.vectors_length))
    oov = oov / oov.sum(axis=1, keepdims=True)
-    
+
-    vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype='float32')
+    vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype="float32")
-    vectors[1:(nr_unk + 1), ] = oov
+    vectors[1 : (nr_unk + 1),] = oov
    for lex in vocab:
        if lex.has_vector and lex.vector_norm > 0:
-            vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm 
+            vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm
    return vectors
 def get_word_ids(docs, max_length=100, nr_unk=100):
-    Xs = np.zeros((len(docs), max_length), dtype='int32')
+    Xs = np.zeros((len(docs), max_length), dtype="int32")
-    
+
    for i, doc in enumerate(docs):
        for j, token in enumerate(doc):
            if j == max_length:
--- a/examples/training/ner_example_data/README.md
+++ b/examples/training/ner_example_data/README.md
@ -0,0 +1,7 @@
 ## Examples of NER/IOB data that can be converted with `spacy convert`
 spaCy JSON training files were generated with:
 ```
 python -m spacy convert -c iob -s -n 10 -b en file.iob
 ```
--- a/examples/training/ner_example_data/ner-sent-per-line.iob
+++ b/examples/training/ner_example_data/ner-sent-per-line.iob
@ -0,0 +1,2 @@
 When|WRB|O Sebastian|NNP|B-PERSON Thrun|NNP|I-PERSON started|VBD|O working|VBG|O on|IN|O self|NN|O -|HYPH|O driving|VBG|O cars|NNS|O at|IN|O Google|NNP|B-ORG in|IN|O 2007|CD|B-DATE ,|,|O few|JJ|O people|NNS|O outside|RB|O of|IN|O the|DT|O company|NN|O took|VBD|O him|PRP|O seriously|RB|O .|.|O
 “|''|O I|PRP|O can|MD|O tell|VB|O you|PRP|O very|RB|O senior|JJ|O CEOs|NNS|O of|IN|O major|JJ|O American|JJ|B-NORP car|NN|O companies|NNS|O would|MD|O shake|VB|O my|PRP$|O hand|NN|O and|CC|O turn|VB|O away|RB|O because|IN|O I|PRP|O was|VBD|O n’t|RB|O worth|JJ|O talking|VBG|O to|IN|O ,|,|O ”|''|O said|VBD|O Thrun|NNP|B-PERSON ,|,|O in|IN|O an|DT|O interview|NN|O with|IN|O Recode|NNP|B-ORG earlier|RBR|B-DATE this|DT|I-DATE week|NN|I-DATE .|.|O
--- a/examples/training/ner_example_data/ner-sent-per-line.json
+++ b/examples/training/ner_example_data/ner-sent-per-line.json
@ -0,0 +1,349 @@
 [
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"When",
                "tag":"WRB",
                "ner":"O"
              },
              {
                "orth":"Sebastian",
                "tag":"NNP",
                "ner":"B-PERSON"
              },
              {
                "orth":"Thrun",
                "tag":"NNP",
                "ner":"L-PERSON"
              },
              {
                "orth":"started",
                "tag":"VBD",
                "ner":"O"
              },
              {
                "orth":"working",
                "tag":"VBG",
                "ner":"O"
              },
              {
                "orth":"on",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"self",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"-",
                "tag":"HYPH",
                "ner":"O"
              },
              {
                "orth":"driving",
                "tag":"VBG",
                "ner":"O"
              },
              {
                "orth":"cars",
                "tag":"NNS",
                "ner":"O"
              },
              {
                "orth":"at",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"Google",
                "tag":"NNP",
                "ner":"U-ORG"
              },
              {
                "orth":"in",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"2007",
                "tag":"CD",
                "ner":"U-DATE"
              },
              {
                "orth":",",
                "tag":",",
                "ner":"O"
              },
              {
                "orth":"few",
                "tag":"JJ",
                "ner":"O"
              },
              {
                "orth":"people",
                "tag":"NNS",
                "ner":"O"
              },
              {
                "orth":"outside",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":"of",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"the",
                "tag":"DT",
                "ner":"O"
              },
              {
                "orth":"company",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"took",
                "tag":"VBD",
                "ner":"O"
              },
              {
                "orth":"him",
                "tag":"PRP",
                "ner":"O"
              },
              {
                "orth":"seriously",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":".",
                "tag":".",
                "ner":"O"
              }
            ]
          },
          {
            "tokens":[
              {
                "orth":"\u201c",
                "tag":"''",
                "ner":"O"
              },
              {
                "orth":"I",
                "tag":"PRP",
                "ner":"O"
              },
              {
                "orth":"can",
                "tag":"MD",
                "ner":"O"
              },
              {
                "orth":"tell",
                "tag":"VB",
                "ner":"O"
              },
              {
                "orth":"you",
                "tag":"PRP",
                "ner":"O"
              },
              {
                "orth":"very",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":"senior",
                "tag":"JJ",
                "ner":"O"
              },
              {
                "orth":"CEOs",
                "tag":"NNS",
                "ner":"O"
              },
              {
                "orth":"of",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"major",
                "tag":"JJ",
                "ner":"O"
              },
              {
                "orth":"American",
                "tag":"JJ",
                "ner":"U-NORP"
              },
              {
                "orth":"car",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"companies",
                "tag":"NNS",
                "ner":"O"
              },
              {
                "orth":"would",
                "tag":"MD",
                "ner":"O"
              },
              {
                "orth":"shake",
                "tag":"VB",
                "ner":"O"
              },
              {
                "orth":"my",
                "tag":"PRP$",
                "ner":"O"
              },
              {
                "orth":"hand",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"and",
                "tag":"CC",
                "ner":"O"
              },
              {
                "orth":"turn",
                "tag":"VB",
                "ner":"O"
              },
              {
                "orth":"away",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":"because",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"I",
                "tag":"PRP",
                "ner":"O"
              },
              {
                "orth":"was",
                "tag":"VBD",
                "ner":"O"
              },
              {
                "orth":"n\u2019t",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":"worth",
                "tag":"JJ",
                "ner":"O"
              },
              {
                "orth":"talking",
                "tag":"VBG",
                "ner":"O"
              },
              {
                "orth":"to",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":",",
                "tag":",",
                "ner":"O"
              },
              {
                "orth":"\u201d",
                "tag":"''",
                "ner":"O"
              },
              {
                "orth":"said",
                "tag":"VBD",
                "ner":"O"
              },
              {
                "orth":"Thrun",
                "tag":"NNP",
                "ner":"U-PERSON"
              },
              {
                "orth":",",
                "tag":",",
                "ner":"O"
              },
              {
                "orth":"in",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"an",
                "tag":"DT",
                "ner":"O"
              },
              {
                "orth":"interview",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"with",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"Recode",
                "tag":"NNP",
                "ner":"U-ORG"
              },
              {
                "orth":"earlier",
                "tag":"RBR",
                "ner":"B-DATE"
              },
              {
                "orth":"this",
                "tag":"DT",
                "ner":"I-DATE"
              },
              {
                "orth":"week",
                "tag":"NN",
                "ner":"L-DATE"
              },
              {
                "orth":".",
                "tag":".",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  }
 ]
--- a/examples/training/ner_example_data/ner-token-per-line-conll2003.iob
+++ b/examples/training/ner_example_data/ner-token-per-line-conll2003.iob
@ -0,0 +1,70 @@
 -DOCSTART- -X- O O
 When WRB _ O
 Sebastian NNP _ B-PERSON
 Thrun NNP _ I-PERSON
 started VBD _ O
 working VBG _ O
 on IN _ O
 self NN _ O
 - HYPH _ O
 driving VBG _ O
 cars NNS _ O
 at IN _ O
 Google NNP _ B-ORG
 in IN _ O
 2007 CD _ B-DATE
 , , _ O
 few JJ _ O
 people NNS _ O
 outside RB _ O
 of IN _ O
 the DT _ O
 company NN _ O
 took VBD _ O
 him PRP _ O
 seriously RB _ O
 . . _ O
 “ '' _ O
 I PRP _ O
 can MD _ O
 tell VB _ O
 you PRP _ O
 very RB _ O
 senior JJ _ O
 CEOs NNS _ O
 of IN _ O
 major JJ _ O
 American JJ _ B-NORP
 car NN _ O
 companies NNS _ O
 would MD _ O
 shake VB _ O
 my PRP$ _ O
 hand NN _ O
 and CC _ O
 turn VB _ O
 away RB _ O
 because IN _ O
 I PRP _ O
 was VBD _ O
 n’t RB _ O
 worth JJ _ O
 talking VBG _ O
 to IN _ O
 , , _ O
 ” '' _ O
 said VBD _ O
 Thrun NNP _ B-PERSON
 , , _ O
 in IN _ O
 an DT _ O
 interview NN _ O
 with IN _ O
 Recode NNP _ B-ORG
 earlier RBR _ B-DATE
 this DT _ I-DATE
 week NN _ I-DATE
 . . _ O
--- a/examples/training/ner_example_data/ner-token-per-line-conll2003.json
+++ b/examples/training/ner_example_data/ner-token-per-line-conll2003.json
@ -0,0 +1,349 @@
 [
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"When",
                "tag":"WRB",
                "ner":"O"
              },
              {
                "orth":"Sebastian",
                "tag":"NNP",
                "ner":"B-PERSON"
              },
              {
                "orth":"Thrun",
                "tag":"NNP",
                "ner":"L-PERSON"
              },
              {
                "orth":"started",
                "tag":"VBD",
                "ner":"O"
              },
              {
                "orth":"working",
                "tag":"VBG",
                "ner":"O"
              },
              {
                "orth":"on",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"self",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"-",
                "tag":"HYPH",
                "ner":"O"
              },
              {
                "orth":"driving",
                "tag":"VBG",
                "ner":"O"
              },
              {
                "orth":"cars",
                "tag":"NNS",
                "ner":"O"
              },
              {
                "orth":"at",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"Google",
                "tag":"NNP",
                "ner":"U-ORG"
              },
              {
                "orth":"in",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"2007",
                "tag":"CD",
                "ner":"U-DATE"
              },
              {
                "orth":",",
                "tag":",",
                "ner":"O"
              },
              {
                "orth":"few",
                "tag":"JJ",
                "ner":"O"
              },
              {
                "orth":"people",
                "tag":"NNS",
                "ner":"O"
              },
              {
                "orth":"outside",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":"of",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"the",
                "tag":"DT",
                "ner":"O"
              },
              {
                "orth":"company",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"took",
                "tag":"VBD",
                "ner":"O"
              },
              {
                "orth":"him",
                "tag":"PRP",
                "ner":"O"
              },
              {
                "orth":"seriously",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":".",
                "tag":".",
                "ner":"O"
              }
            ]
          },
          {
            "tokens":[
              {
                "orth":"\u201c",
                "tag":"''",
                "ner":"O"
              },
              {
                "orth":"I",
                "tag":"PRP",
                "ner":"O"
              },
              {
                "orth":"can",
                "tag":"MD",
                "ner":"O"
              },
              {
                "orth":"tell",
                "tag":"VB",
                "ner":"O"
              },
              {
                "orth":"you",
                "tag":"PRP",
                "ner":"O"
              },
              {
                "orth":"very",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":"senior",
                "tag":"JJ",
                "ner":"O"
              },
              {
                "orth":"CEOs",
                "tag":"NNS",
                "ner":"O"
              },
              {
                "orth":"of",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"major",
                "tag":"JJ",
                "ner":"O"
              },
              {
                "orth":"American",
                "tag":"JJ",
                "ner":"U-NORP"
              },
              {
                "orth":"car",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"companies",
                "tag":"NNS",
                "ner":"O"
              },
              {
                "orth":"would",
                "tag":"MD",
                "ner":"O"
              },
              {
                "orth":"shake",
                "tag":"VB",
                "ner":"O"
              },
              {
                "orth":"my",
                "tag":"PRP$",
                "ner":"O"
              },
              {
                "orth":"hand",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"and",
                "tag":"CC",
                "ner":"O"
              },
              {
                "orth":"turn",
                "tag":"VB",
                "ner":"O"
              },
              {
                "orth":"away",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":"because",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"I",
                "tag":"PRP",
                "ner":"O"
              },
              {
                "orth":"was",
                "tag":"VBD",
                "ner":"O"
              },
              {
                "orth":"n\u2019t",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":"worth",
                "tag":"JJ",
                "ner":"O"
              },
              {
                "orth":"talking",
                "tag":"VBG",
                "ner":"O"
              },
              {
                "orth":"to",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":",",
                "tag":",",
                "ner":"O"
              },
              {
                "orth":"\u201d",
                "tag":"''",
                "ner":"O"
              },
              {
                "orth":"said",
                "tag":"VBD",
                "ner":"O"
              },
              {
                "orth":"Thrun",
                "tag":"NNP",
                "ner":"U-PERSON"
              },
              {
                "orth":",",
                "tag":",",
                "ner":"O"
              },
              {
                "orth":"in",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"an",
                "tag":"DT",
                "ner":"O"
              },
              {
                "orth":"interview",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"with",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"Recode",
                "tag":"NNP",
                "ner":"U-ORG"
              },
              {
                "orth":"earlier",
                "tag":"RBR",
                "ner":"B-DATE"
              },
              {
                "orth":"this",
                "tag":"DT",
                "ner":"I-DATE"
              },
              {
                "orth":"week",
                "tag":"NN",
                "ner":"L-DATE"
              },
              {
                "orth":".",
                "tag":".",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  }
 ]
--- a/examples/training/ner_example_data/ner-token-per-line-with-pos.iob
+++ b/examples/training/ner_example_data/ner-token-per-line-with-pos.iob
@ -0,0 +1,66 @@
 When	WRB	O
 Sebastian	NNP	B-PERSON
 Thrun	NNP	I-PERSON
 started	VBD	O
 working	VBG	O
 on	IN	O
 self	NN	O
 -	HYPH	O
 driving	VBG	O
 cars	NNS	O
 at	IN	O
 Google	NNP	B-ORG
 in	IN	O
 2007	CD	B-DATE
 ,	,	O
 few	JJ	O
 people	NNS	O
 outside	RB	O
 of	IN	O
 the	DT	O
 company	NN	O
 took	VBD	O
 him	PRP	O
 seriously	RB	O
 .	.	O
 “	''	O
 I	PRP	O
 can	MD	O
 tell	VB	O
 you	PRP	O
 very	RB	O
 senior	JJ	O
 CEOs	NNS	O
 of	IN	O
 major	JJ	O
 American	JJ	B-NORP
 car	NN	O
 companies	NNS	O
 would	MD	O
 shake	VB	O
 my	PRP$	O
 hand	NN	O
 and	CC	O
 turn	VB	O
 away	RB	O
 because	IN	O
 I	PRP	O
 was	VBD	O
 n’t	RB	O
 worth	JJ	O
 talking	VBG	O
 to	IN	O
 ,	,	O
 ”	''	O
 said	VBD	O
 Thrun	NNP	B-PERSON
 ,	,	O
 in	IN	O
 an	DT	O
 interview	NN	O
 with	IN	O
 Recode	NNP	B-ORG
 earlier	RBR	B-DATE
 this	DT	I-DATE
 week	NN	I-DATE
 .	.	O
--- a/examples/training/ner_example_data/ner-token-per-line-with-pos.json
+++ b/examples/training/ner_example_data/ner-token-per-line-with-pos.json
@ -0,0 +1,353 @@
 [
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"When",
                "tag":"WRB",
                "ner":"O"
              },
              {
                "orth":"Sebastian",
                "tag":"NNP",
                "ner":"B-PERSON"
              },
              {
                "orth":"Thrun",
                "tag":"NNP",
                "ner":"L-PERSON"
              },
              {
                "orth":"started",
                "tag":"VBD",
                "ner":"O"
              },
              {
                "orth":"working",
                "tag":"VBG",
                "ner":"O"
              },
              {
                "orth":"on",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"self",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"-",
                "tag":"HYPH",
                "ner":"O"
              },
              {
                "orth":"driving",
                "tag":"VBG",
                "ner":"O"
              },
              {
                "orth":"cars",
                "tag":"NNS",
                "ner":"O"
              },
              {
                "orth":"at",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"Google",
                "tag":"NNP",
                "ner":"U-ORG"
              },
              {
                "orth":"in",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"2007",
                "tag":"CD",
                "ner":"U-DATE"
              },
              {
                "orth":",",
                "tag":",",
                "ner":"O"
              },
              {
                "orth":"few",
                "tag":"JJ",
                "ner":"O"
              },
              {
                "orth":"people",
                "tag":"NNS",
                "ner":"O"
              },
              {
                "orth":"outside",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":"of",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"the",
                "tag":"DT",
                "ner":"O"
              },
              {
                "orth":"company",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"took",
                "tag":"VBD",
                "ner":"O"
              },
              {
                "orth":"him",
                "tag":"PRP",
                "ner":"O"
              },
              {
                "orth":"seriously",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":".",
                "tag":".",
                "ner":"O"
              }
            ]
          },
          {
            "tokens":[
              {
                "orth":"\u201c",
                "tag":"''",
                "ner":"O"
              }
            ]
          },
          {
            "tokens":[
              {
                "orth":"I",
                "tag":"PRP",
                "ner":"O"
              },
              {
                "orth":"can",
                "tag":"MD",
                "ner":"O"
              },
              {
                "orth":"tell",
                "tag":"VB",
                "ner":"O"
              },
              {
                "orth":"you",
                "tag":"PRP",
                "ner":"O"
              },
              {
                "orth":"very",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":"senior",
                "tag":"JJ",
                "ner":"O"
              },
              {
                "orth":"CEOs",
                "tag":"NNS",
                "ner":"O"
              },
              {
                "orth":"of",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"major",
                "tag":"JJ",
                "ner":"O"
              },
              {
                "orth":"American",
                "tag":"JJ",
                "ner":"U-NORP"
              },
              {
                "orth":"car",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"companies",
                "tag":"NNS",
                "ner":"O"
              },
              {
                "orth":"would",
                "tag":"MD",
                "ner":"O"
              },
              {
                "orth":"shake",
                "tag":"VB",
                "ner":"O"
              },
              {
                "orth":"my",
                "tag":"PRP$",
                "ner":"O"
              },
              {
                "orth":"hand",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"and",
                "tag":"CC",
                "ner":"O"
              },
              {
                "orth":"turn",
                "tag":"VB",
                "ner":"O"
              },
              {
                "orth":"away",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":"because",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"I",
                "tag":"PRP",
                "ner":"O"
              },
              {
                "orth":"was",
                "tag":"VBD",
                "ner":"O"
              },
              {
                "orth":"n\u2019t",
                "tag":"RB",
                "ner":"O"
              },
              {
                "orth":"worth",
                "tag":"JJ",
                "ner":"O"
              },
              {
                "orth":"talking",
                "tag":"VBG",
                "ner":"O"
              },
              {
                "orth":"to",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":",",
                "tag":",",
                "ner":"O"
              },
              {
                "orth":"\u201d",
                "tag":"''",
                "ner":"O"
              },
              {
                "orth":"said",
                "tag":"VBD",
                "ner":"O"
              },
              {
                "orth":"Thrun",
                "tag":"NNP",
                "ner":"U-PERSON"
              },
              {
                "orth":",",
                "tag":",",
                "ner":"O"
              },
              {
                "orth":"in",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"an",
                "tag":"DT",
                "ner":"O"
              },
              {
                "orth":"interview",
                "tag":"NN",
                "ner":"O"
              },
              {
                "orth":"with",
                "tag":"IN",
                "ner":"O"
              },
              {
                "orth":"Recode",
                "tag":"NNP",
                "ner":"U-ORG"
              },
              {
                "orth":"earlier",
                "tag":"RBR",
                "ner":"B-DATE"
              },
              {
                "orth":"this",
                "tag":"DT",
                "ner":"I-DATE"
              },
              {
                "orth":"week",
                "tag":"NN",
                "ner":"L-DATE"
              },
              {
                "orth":".",
                "tag":".",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  }
 ]
--- a/examples/training/ner_example_data/ner-token-per-line.iob
+++ b/examples/training/ner_example_data/ner-token-per-line.iob
@ -0,0 +1,66 @@
 When	O
 Sebastian	B-PERSON
 Thrun	I-PERSON
 started	O
 working	O
 on	O
 self	O
 -	O
 driving	O
 cars	O
 at	O
 Google	B-ORG
 in	O
 2007	B-DATE
 ,	O
 few	O
 people	O
 outside	O
 of	O
 the	O
 company	O
 took	O
 him	O
 seriously	O
 .	O
 “	O
 I	O
 can	O
 tell	O
 you	O
 very	O
 senior	O
 CEOs	O
 of	O
 major	O
 American	B-NORP
 car	O
 companies	O
 would	O
 shake	O
 my	O
 hand	O
 and	O
 turn	O
 away	O
 because	O
 I	O
 was	O
 n’t	O
 worth	O
 talking	O
 to	O
 ,	O
 ”	O
 said	O
 Thrun	B-PERSON
 ,	O
 in	O
 an	O
 interview	O
 with	O
 Recode	B-ORG
 earlier	B-DATE
 this	I-DATE
 week	I-DATE
 .	O
--- a/examples/training/ner_example_data/ner-token-per-line.json
+++ b/examples/training/ner_example_data/ner-token-per-line.json
@ -0,0 +1,353 @@
 [
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"When",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"Sebastian",
                "tag":"-",
                "ner":"B-PERSON"
              },
              {
                "orth":"Thrun",
                "tag":"-",
                "ner":"L-PERSON"
              },
              {
                "orth":"started",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"working",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"on",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"self",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"-",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"driving",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"cars",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"at",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"Google",
                "tag":"-",
                "ner":"U-ORG"
              },
              {
                "orth":"in",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"2007",
                "tag":"-",
                "ner":"U-DATE"
              },
              {
                "orth":",",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"few",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"people",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"outside",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"of",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"the",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"company",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"took",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"him",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"seriously",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":".",
                "tag":"-",
                "ner":"O"
              }
            ]
          },
          {
            "tokens":[
              {
                "orth":"\u201c",
                "tag":"-",
                "ner":"O"
              }
            ]
          },
          {
            "tokens":[
              {
                "orth":"I",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"can",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"tell",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"you",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"very",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"senior",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"CEOs",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"of",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"major",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"American",
                "tag":"-",
                "ner":"U-NORP"
              },
              {
                "orth":"car",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"companies",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"would",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"shake",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"my",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"hand",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"and",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"turn",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"away",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"because",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"I",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"was",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"n\u2019t",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"worth",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"talking",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"to",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":",",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"\u201d",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"said",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"Thrun",
                "tag":"-",
                "ner":"U-PERSON"
              },
              {
                "orth":",",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"in",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"an",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"interview",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"with",
                "tag":"-",
                "ner":"O"
              },
              {
                "orth":"Recode",
                "tag":"-",
                "ner":"U-ORG"
              },
              {
                "orth":"earlier",
                "tag":"-",
                "ner":"B-DATE"
              },
              {
                "orth":"this",
                "tag":"-",
                "ner":"I-DATE"
              },
              {
                "orth":"week",
                "tag":"-",
                "ner":"L-DATE"
              },
              {
                "orth":".",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  }
 ]
--- a/examples/training/rehearsal.py
+++ b/examples/training/rehearsal.py
@ -80,7 +80,7 @@ def main(model_name, unlabelled_loc):
                nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
            print("Losses", losses)
            print("R. Losses", r_losses)
-    print(nlp.get_pipe('ner').model.unseen_classes)
+    print(nlp.get_pipe("ner").model.unseen_classes)
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
@ -88,7 +88,5 @@ def main(model_name, unlabelled_loc):
        print(ent.label_, ent.text)
 if __name__ == "__main__":
    plac.call(main)
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@ -24,7 +24,7 @@ from spacy.util import minibatch, compounding
    output_dir=("Optional output directory", "option", "o", Path),
    n_texts=("Number of texts to train from", "option", "t", int),
    n_iter=("Number of training iterations", "option", "n", int),
-    init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path)
+    init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path),
 )
 def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
    if output_dir is not None:
@ -43,11 +43,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
-            "textcat",
+            "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
            config={
                "exclusive_classes": True,
                "architecture": "simple_cnn",
            }
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@ -5,12 +5,14 @@ import plac
 from pathlib import Path
 from wasabi import Printer
 import srsly
 import re
 from .converters import conllu2json, iob2json, conll_ner2json
 from .converters import ner_jsonl2json
-# Converters are matched by file extension. To add a converter, add a new
+# Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
 # entry to this dict with the file extension mapped to the converter function
 # imported from /converters.
 CONVERTERS = {
@ -31,7 +33,9 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
    input_file=("Input file", "positional", None, str),
    output_dir=("Output directory. '-' for stdout.", "positional", None, str),
    file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
-    n_sents=("Number of sentences per doc", "option", "n", int),
+    n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
    seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
    model=("Model for sentence segmentation (for -s)", "option", "b", str),
    converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
    lang=("Language (if tokenizer required)", "option", "l", str),
    morphology=("Enable appending morphology to tags", "flag", "m", bool),
@ -41,6 +45,8 @@ def convert(
    output_dir="-",
    file_type="json",
    n_sents=1,
    seg_sents=False,
    model=None,
    morphology=False,
    converter="auto",
    lang=None,
@ -70,14 +76,33 @@ def convert(
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
        msg.fail("Output directory not found", output_dir, exits=1)
    input_data = input_path.open("r", encoding="utf-8").read()
    if converter == "auto":
        converter = input_path.suffix[1:]
    if converter == "ner" or converter == "iob":
        converter_autodetect = autodetect_ner_format(input_data)
        if converter_autodetect == "ner":
            msg.info("Auto-detected token-per-line NER format")
            converter = converter_autodetect
        elif converter_autodetect == "iob":
            msg.info("Auto-detected sentence-per-line NER format")
            converter = converter_autodetect
        else:
            msg.warn(
                "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
            )
    if converter not in CONVERTERS:
        msg.fail("Can't find converter for {}".format(converter), exits=1)
    # Use converter function to convert data
    func = CONVERTERS[converter]
-    input_data = input_path.open("r", encoding="utf-8").read()
+    data = func(
-    data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
+        input_data,
        n_sents=n_sents,
        seg_sents=seg_sents,
        use_morphology=morphology,
        lang=lang,
        model=model,
    )
    if output_dir != "-":
        # Export data to a file
        suffix = ".{}".format(file_type)
@ -88,10 +113,31 @@ def convert(
            srsly.write_jsonl(output_file, data)
        elif file_type == "msg":
            srsly.write_msgpack(output_file, data)
-        msg.good("Generated output file ({} documents)".format(len(data)), output_file)
+        msg.good(
            "Generated output file ({} documents): {}".format(len(data), output_file)
        )
    else:
        # Print to stdout
        if file_type == "json":
            srsly.write_json("-", data)
        elif file_type == "jsonl":
            srsly.write_jsonl("-", data)
 def autodetect_ner_format(input_data):
    """Guess the NER input format from the first 20 lines of the data.

    Lines containing a `token|TAG` pattern count towards the
    sentence-per-line format ("iob"); lines ending in whitespace followed by
    an IOB tag count towards the token-per-line format ("ner").

    input_data (unicode): The raw file contents.
    RETURNS (unicode or None): "ner" or "iob" if exactly one of the two
        patterns was seen, otherwise None.
    """
    sent_per_line_re = re.compile(r"\S+\|(O|[IB]-\S+)")
    token_per_line_re = re.compile(r"\S+\s+(O|[IB]-\S+)$")
    # only the head of the file is inspected
    head = [raw.strip() for raw in input_data.split("\n")[:20]]
    n_iob = sum(1 for text in head if sent_per_line_re.search(text))
    n_ner = sum(1 for text in head if token_per_line_re.search(text))
    if n_ner > 0 and n_iob == 0:
        return "ner"
    if n_iob > 0 and n_ner == 0:
        return "iob"
    # no evidence at all, or conflicting evidence for both formats
    return None
--- a/spacy/cli/converters/conll_ner2json.py
+++ b/spacy/cli/converters/conll_ner2json.py
@ -1,17 +1,89 @@
 # coding: utf8
 from __future__ import unicode_literals
 from wasabi import Printer
 from ...gold import iob_to_biluo
 from ...lang.xx import MultiLanguage
 from ...tokens.doc import Doc
 from ...util import load_model
-def conll_ner2json(input_data, **kwargs):
+def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs):
    """
-    Convert files in the CoNLL-2003 NER format into JSON format for use with
+    Convert files in the CoNLL-2003 NER format and similar
-    train cli.
+    whitespace-separated columns into JSON format for use with train cli.
    The first column is the tokens, the final column is the IOB tags. If an
    additional second column is present, the second column is the tags.
    Sentences are separated with whitespace and documents can be separated
    using the line "-DOCSTART- -X- O O".
    Sample format:
    -DOCSTART- -X- O O
    I O
    like O
    London B-GPE
    and O
    New B-GPE
    York I-GPE
    City I-GPE
    . O
    """
-    delimit_docs = "-DOCSTART- -X- O O"
+    msg = Printer()
    doc_delimiter = "-DOCSTART- -X- O O"
    # check for existing delimiters, which should be preserved
    if "\n\n" in input_data and seg_sents:
        msg.warn(
            "Sentence boundaries found, automatic sentence segmentation with "
            "`-s` disabled."
        )
        seg_sents = False
    if doc_delimiter in input_data and n_sents:
        msg.warn(
            "Document delimiters found, automatic document segmentation with "
            "`-n` disabled."
        )
        n_sents = 0
    # do document segmentation with existing sentences
    if "\n\n" in input_data and doc_delimiter not in input_data and n_sents:
        n_sents_info(msg, n_sents)
        input_data = segment_docs(input_data, n_sents, doc_delimiter)
    # do sentence segmentation with existing documents
    if "\n\n" not in input_data and doc_delimiter in input_data and seg_sents:
        input_data = segment_sents_and_docs(input_data, 0, "", model=model, msg=msg)
    # do both sentence segmentation and document segmentation according
    # to options
    if "\n\n" not in input_data and doc_delimiter not in input_data:
        # sentence segmentation required for document segmentation
        if n_sents > 0 and not seg_sents:
            msg.warn(
                "No sentence boundaries found to use with option `-n {}`. "
                "Use `-s` to automatically segment sentences or `-n 0` "
                "to disable.".format(n_sents)
            )
        else:
            n_sents_info(msg, n_sents)
            input_data = segment_sents_and_docs(
                input_data, n_sents, doc_delimiter, model=model, msg=msg
            )
    # provide warnings for problematic data
    if "\n\n" not in input_data:
        msg.warn(
            "No sentence boundaries found. Use `-s` to automatically segment "
            "sentences."
        )
    if doc_delimiter not in input_data:
        msg.warn(
            "No document delimiters found. Use `-n` to automatically group "
            "sentences into documents."
        )
    output_docs = []
-    for doc in input_data.strip().split(delimit_docs):
+    for doc in input_data.strip().split(doc_delimiter):
        doc = doc.strip()
        if not doc:
            continue
@ -21,7 +93,19 @@ def conll_ner2json(input_data, **kwargs):
            if not sent:
                continue
            lines = [line.strip() for line in sent.split("\n") if line.strip()]
-            words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
+            cols = list(zip(*[line.split() for line in lines]))
            if len(cols) < 2:
                raise ValueError(
                    "The token-per-line NER file is not formatted correctly. "
                    "Try checking whitespace and delimiters. See "
                    "https://spacy.io/api/cli#convert"
                )
            words = cols[0]
            iob_ents = cols[-1]
            if len(cols) > 2:
                tags = cols[1]
            else:
                tags = ["-"] * len(words)
            biluo_ents = iob_to_biluo(iob_ents)
            output_doc.append(
                {
@ -36,3 +120,53 @@ def conll_ner2json(input_data, **kwargs):
        )
        output_doc = []
    return output_docs
 def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
    """Add sentence (and optionally document) boundaries to token-per-line data.

    The first whitespace-separated column of every line is used as the token
    text. The tokens are run through either the parser of the given model or
    a rule-based sentencizer, and an empty line is inserted before every
    token marked as a sentence start.

    doc (unicode): Token-per-line input data.
    n_sents (int): If non-zero, `doc_delimiter` is inserted before every
        `n_sents`-th detected sentence.
    doc_delimiter (unicode): The line to insert at document boundaries.
    model (unicode): Optional model to load; its parser is used for
        segmentation if the pipeline contains one.
    msg (Printer): Printer used for status messages. Created on demand if
        not provided.
    RETURNS (unicode): The original lines plus the inserted boundary lines,
        joined with newlines.
    """
    # `msg` defaults to None, so make sure a printer exists before the first
    # status message is emitted.
    if msg is None:
        msg = Printer()
    sentencizer = None
    if model:
        nlp = load_model(model)
        if "parser" in nlp.pipe_names:
            msg.info("Segmenting sentences with parser from model '{}'.".format(model))
            sentencizer = nlp.get_pipe("parser")
    if not sentencizer:
        # no model given, or the model has no parser: use the sentencizer
        msg.info(
            "Segmenting sentences with sentencizer. (Use `-b model` for "
            "improved parser-based sentence segmentation.)"
        )
        nlp = MultiLanguage()
        sentencizer = nlp.create_pipe("sentencizer")
    lines = doc.strip().split("\n")
    words = [line.strip().split()[0] for line in lines]
    nlpdoc = Doc(nlp.vocab, words=words)
    sentencizer(nlpdoc)
    lines_with_segs = []
    sent_count = 0
    for i, token in enumerate(nlpdoc):
        if token.is_sent_start:
            if n_sents and sent_count % n_sents == 0:
                lines_with_segs.append(doc_delimiter)
            # the empty entry becomes a blank line once joined
            lines_with_segs.append("")
            sent_count += 1
        # tokens and input lines are aligned one-to-one
        lines_with_segs.append(lines[i])
    return "\n".join(lines_with_segs)
 def segment_docs(input_data, n_sents, doc_delimiter):
    """Group already sentence-segmented data into documents.

    input_data (unicode): Data in which sentences are separated by an empty
        line.
    n_sents (int): Number of sentences per document.
    doc_delimiter (unicode): The text that introduces every document.
    RETURNS (unicode): The data with an empty line and `doc_delimiter`
        placed in front of every group of `n_sents` sentences.
    """
    sent_delimiter = "\n\n"
    sents = input_data.split(sent_delimiter)
    chunks = []
    for start in range(0, len(sents), n_sents):
        group = sents[start : start + n_sents]
        chunks.append(sent_delimiter + doc_delimiter + sent_delimiter.join(group))
    return "".join(chunks)
 def n_sents_info(msg, n_sents):
    """Report the document grouping size and warn if it is a single sentence.

    msg (Printer): Printer whose `info` and `warn` methods are called.
    n_sents (int): Number of sentences grouped into one document.
    """
    grouping = "Grouping every {} sentences into a document.".format(n_sents)
    msg.info(grouping)
    if n_sents != 1:
        return
    msg.warn(
        "To generate better training data, you may want to group "
        "sentences into documents with `-n 10`."
    )
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/cli/converters/iob2json.py
@ -2,17 +2,30 @@
 from __future__ import unicode_literals
 import re
 from wasabi import Printer
 from ...gold import iob_to_biluo
 from ...util import minibatch
 from .conll_ner2json import n_sents_info
 def iob2json(input_data, n_sents=10, *args, **kwargs):
    """
-    Convert IOB files into JSON format for use with train cli.
+    Convert IOB files with one sentence per line and tags separated with '|'
    into JSON format for use with train cli. IOB and IOB2 are accepted.
    Sample formats:
    I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
    I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
    I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
    I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
    """
-    sentences = read_iob(input_data.split("\n"))
+    msg = Printer()
-    docs = merge_sentences(sentences, n_sents)
+    docs = read_iob(input_data.split("\n"))
    if n_sents > 0:
        n_sents_info(msg, n_sents)
        docs = merge_sentences(docs, n_sents)
    return docs
@ -21,7 +34,7 @@ def read_iob(raw_sents):
    for line in raw_sents:
        if not line.strip():
            continue
-        tokens = [re.split("[^\w\-]", line.strip())]
+        tokens = [t.split("|") for t in line.split()]
        if len(tokens[0]) == 3:
            words, pos, iob = zip(*tokens)
        elif len(tokens[0]) == 2:
@ -29,7 +42,7 @@ def read_iob(raw_sents):
            pos = ["-"] * len(words)
        else:
            raise ValueError(
-                "The iob/iob2 file is not formatted correctly. Try checking whitespace and delimiters."
+                "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
            )
        biluo = iob_to_biluo(iob)
        sentences.append(
@ -40,7 +53,7 @@ def read_iob(raw_sents):
        )
    sentences = [{"tokens": sent} for sent in sentences]
    paragraphs = [{"sentences": [sent]} for sent in sentences]
-    docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
+    docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)]
    return docs
@ -50,7 +63,7 @@ def merge_sentences(docs, n_sents):
        group = list(group)
        first = group.pop(0)
        to_extend = first["paragraphs"][0]["sentences"]
-        for sent in group[1:]:
+        for sent in group:
            to_extend.extend(sent["paragraphs"][0]["sentences"])
        merged.append(first)
    return merged
--- a/spacy/language.py
+++ b/spacy/language.py
@ -38,8 +38,8 @@ from . import about
 class BaseDefaults(object):
    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
-        lemma_rules, lemma_index, lemma_exc, lemma_lookup = util.get_lemma_tables(lookups)
+        rules, index, exc, lookup = util.get_lemma_tables(lookups)
-        return Lemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+        return Lemmatizer(index, exc, rules, lookup)
    @classmethod
    def create_lookups(cls, nlp=None):
--- a/spacy/matcher/_schemas.py
+++ b/spacy/matcher/_schemas.py
@ -142,10 +142,34 @@ TOKEN_PATTERN_SCHEMA = {
                "title": "Token is whitespace",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_BRACKET": {
                "title": "Token is a bracket",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_QUOTE": {
                "title": "Token is a quotation mark",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_LEFT_PUNCT": {
                "title": "Token is a left punctuation mark",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_RIGHT_PUNCT": {
                "title": "Token is a right punctuation mark",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_CURRENCY": {
                "title": "Token is a currency symbol",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_STOP": {
                "title": "Token is stop word",
                "$ref": "#/definitions/boolean_value",
            },
            "IS_SENT_START": {
                "title": "Token is the first in a sentence",
                "$ref": "#/definitions/boolean_value",
            },
            "LIKE_NUM": {
                "title": "Token resembles a number",
                "$ref": "#/definitions/boolean_value",
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@ -258,7 +258,7 @@ cdef class Begin:
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int preset_ent_iob = st.B_(0).ent_iob
-        cdef int preset_ent_label = st.B_(0).ent_type
+        cdef attr_t preset_ent_label = st.B_(0).ent_type
        # If we're the last token of the input, we can't B -- must U or O.
        if st.B(1) == -1:
            return False
@ -395,6 +395,9 @@ cdef class Last:
            return False
        elif not st.entity_is_open():
            return False
        elif st.B_(0).ent_iob == 1 and st.B_(1).ent_iob != 1:
            # If a preset entity has I followed by not-I, is L
            return True
        elif st.E_(0).ent_type != label:
            return False
        elif st.B_(1).ent_iob == 1:
--- a/spacy/tests/lang/sr/test_еxceptions.py
+++ b/spacy/tests/lang/sr/test_еxceptions.py
@ -6,8 +6,13 @@ import pytest
@pytest.mark.parametrize(
    "text,norms,lemmas",
-    [("о.г.", ["ове године"], ["ова година"]), ("чет.", ["четвртак"], ["четвртак"]),
+    [
-     ("гђа", ["госпођа"], ["госпођа"]), ("ил'", ["или"], ["или"])])
+        ("о.г.", ["ове године"], ["ова година"]),
        ("чет.", ["четвртак"], ["четвртак"]),
        ("гђа", ["госпођа"], ["госпођа"]),
        ("ил'", ["или"], ["или"]),
    ],
 )
 def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
    tokens = sr_tokenizer(text)
    assert len(tokens) == 1
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -380,3 +380,33 @@ def test_attr_pipeline_checks(en_vocab):
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
@pytest.mark.parametrize(
    "pattern,text",
    [
        ([{"IS_ALPHA": True}], "a"),
        ([{"IS_ASCII": True}], "a"),
        ([{"IS_DIGIT": True}], "1"),
        ([{"IS_LOWER": True}], "a"),
        ([{"IS_UPPER": True}], "A"),
        ([{"IS_TITLE": True}], "Aaaa"),
        ([{"IS_PUNCT": True}], "."),
        ([{"IS_SPACE": True}], "\n"),
        ([{"IS_BRACKET": True}], "["),
        ([{"IS_QUOTE": True}], '"'),
        ([{"IS_LEFT_PUNCT": True}], "``"),
        ([{"IS_RIGHT_PUNCT": True}], "''"),
        ([{"IS_STOP": True}], "the"),
        ([{"LIKE_NUM": True}], "1"),
        ([{"LIKE_URL": True}], "http://example.com"),
        ([{"LIKE_EMAIL": True}], "mail@example.com"),
    ],
 )
 def test_matcher_schema_token_attributes(en_vocab, pattern, text):
    """Check that each boolean token attribute can be used in a pattern.

    Every parametrized case adds a single one-token pattern to a fresh
    Matcher and expects exactly one match on a Doc built from `text`.
    """
    matcher = Matcher(en_vocab)
    # each sample text contains no spaces, so this yields a one-token Doc
    doc = Doc(en_vocab, words=text.split(" "))
    matcher.add("Rule", None, pattern)
    assert len(matcher) == 1
    matches = matcher(doc)
    assert len(matches) == 1
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@ -329,3 +329,4 @@ def test_issue_1971_4(en_vocab):
    matches = matcher(doc)
    # Uncommenting this caused a segmentation fault
    assert len(matches) == 1
    assert matches[0] == (en_vocab.strings["TEST"], 0, 3)
--- a/spacy/tests/regression/test_issue4190.py
+++ b/spacy/tests/regression/test_issue4190.py
@ -0,0 +1,57 @@
 # coding: utf8
 from __future__ import unicode_literals
 from spacy.lang.en import English
 import spacy
 from spacy.tokenizer import Tokenizer
 from spacy.tests.util import make_tempdir
 def test_issue4190():
    """Check that a customized tokenizer survives a save/load round trip.

    The pipeline's tokenizer is replaced, the pipeline is written to disk and
    loaded again, and the reloaded pipeline must tokenize the test string the
    same way as the customized one.
    """
    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    # NOTE(review): result_1a is never asserted; presumably this call is kept
    # so the default tokenizer has already processed the text before it is
    # replaced -- confirm before removing.
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = spacy.load(model_dir)
    # This should be the modified tokenizer
    doc_2 = nlp_2(test_string)
    result_2 = [token.text for token in doc_2]
    assert result_1b == result_2
 def customize_tokenizer(nlp):
    """Replace the pipeline's tokenizer with one that has fewer exceptions.

    The new tokenizer uses the default prefix, suffix and infix rules and the
    current `token_match`, but drops every tokenizer exception that is a
    single character followed by a period (e.g. 'h.').
    """
    defaults = nlp.Defaults
    prefix_re = spacy.util.compile_prefix_regex(defaults.prefixes)
    suffix_re = spacy.util.compile_suffix_regex(defaults.suffixes)
    infix_re = spacy.util.compile_infix_regex(defaults.infixes)
    # keep everything except two-character keys ending in a period
    kept_exceptions = {}
    for key, value in dict(defaults.tokenizer_exceptions).items():
        if len(key) != 2 or key[1] != ".":
            kept_exceptions[key] = value
    nlp.tokenizer = Tokenizer(
        nlp.vocab,
        kept_exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest
 from spacy.lang.en import English
-from spacy.cli.converters import conllu2json
+from spacy.cli.converters import conllu2json, iob2json, conll_ner2json
 from spacy.cli.pretrain import make_docs
@ -32,6 +32,95 @@ def test_cli_converters_conllu2json():
    assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
 def test_cli_converters_iob2json():
    """Convert IOB and IOB2 sentence-per-line data, with and without tags.

    All four input lines must end up as sentences of one document and yield
    the same tokens and BILUO entity tags.
    """
    input_data = "\n".join(
        [
            "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
            "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
            "I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
            "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
        ]
    )
    converted = iob2json(input_data, n_sents=10)
    assert len(converted) == 1
    first_doc = converted[0]
    assert first_doc["id"] == 0
    assert len(first_doc["paragraphs"]) == 1
    assert len(first_doc["paragraphs"][0]["sentences"]) == 4
    # fmt: off
    expected_orth = ["I", "like", "London", "and", "New", "York", "City", "."]
    expected_ner = ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
    # fmt: on
    for sent in first_doc["paragraphs"][0]["sentences"]:
        assert len(sent["tokens"]) == 8
        tokens = sent["tokens"]
        assert [t["orth"] for t in tokens] == expected_orth
        assert [t["ner"] for t in tokens] == expected_ner
 def test_cli_converters_conll_ner2json():
    """Convert token-per-line NER data with varying columns and separators.

    The input is one document with five sentences that use tab or space
    separators and two, three or four columns. Every sentence must yield the
    same tokens and BILUO entity tags.
    """
    lines = [
        "-DOCSTART- -X- O O",
        "",
        "I\tO",
        "like\tO",
        "London\tB-GPE",
        "and\tO",
        "New\tB-GPE",
        "York\tI-GPE",
        "City\tI-GPE",
        ".\tO",
        "",
        "I O",
        "like O",
        "London B-GPE",
        "and O",
        "New B-GPE",
        "York I-GPE",
        "City I-GPE",
        ". O",
        "",
        "I PRP O",
        "like VBP O",
        "London NNP B-GPE",
        "and CC O",
        "New NNP B-GPE",
        "York NNP I-GPE",
        "City NNP I-GPE",
        ". . O",
        "",
        "I PRP _ O",
        "like VBP _ O",
        "London NNP _ B-GPE",
        "and CC _ O",
        "New NNP _ B-GPE",
        "York NNP _ I-GPE",
        "City NNP _ I-GPE",
        ". . _ O",
        "",
        "I\tPRP\t_\tO",
        "like\tVBP\t_\tO",
        "London\tNNP\t_\tB-GPE",
        "and\tCC\t_\tO",
        "New\tNNP\t_\tB-GPE",
        "York\tNNP\t_\tI-GPE",
        "City\tNNP\t_\tI-GPE",
        ".\t.\t_\tO",
    ]
    input_data = "\n".join(lines)
    converted = conll_ner2json(input_data, n_sents=10)
    assert len(converted) == 1
    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 5
    for i in range(0, 5):
        sent = converted[0]["paragraphs"][0]["sentences"][i]
        assert len(sent["tokens"]) == 8
        tokens = sent["tokens"]
        # fmt: off
        assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
        # fmt: on
 def test_pretrain_make_docs():
    nlp = English()
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -441,8 +441,13 @@ cdef class Tokenizer:
            self.infix_finditer = re.compile(data["infix_finditer"]).finditer
        if data.get("token_match"):
            self.token_match = re.compile(data["token_match"]).match
-        for string, substrings in data.get("rules", {}).items():
+        if data.get("rules"):
-            self.add_special_case(string, substrings)
+            # make sure to hard reset the cache to remove data from the default exceptions
            self._rules = {}
            self._cache = PreshMap()
            for string, substrings in data.get("rules", {}).items():
                self.add_special_case(string, substrings)
        return self
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -145,6 +145,8 @@ $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
 | `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option     | Type of file to create (see below).                                                               |
 | `--converter`, `-c` <Tag variant="new">2</Tag>   | option     | Name of converter to use (see below).                                                             |
 | `--n-sents`, `-n`                                | option     | Number of sentences per document.                                                                 |
 | `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | flag       | Segment sentences (for `-c ner`)                                                                  |
 | `--model`, `-b` <Tag variant="new">2.2</Tag>     | option     | Model for parser-based sentence segmentation (for `-s`)                                           |
 | `--morphology`, `-m`                             | option     | Enable appending morphology to tags.                                                              |
 | `--lang`, `-l` <Tag variant="new">2.1</Tag>      | option     | Language code (if tokenizer required).                                                            |
 | `--help`, `-h`                                   | flag       | Show help message and available arguments.                                                        |
@ -174,10 +176,10 @@ All output files generated by this command are compatible with
 | ID                             | Description                                                     |
 | ------------------------------ | --------------------------------------------------------------- |
-| `auto`                         | Automatically pick converter based on file extension (default). |
+| `auto`                         | Automatically pick converter based on file extension and file content (default). |
 | `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format.            |
-| `ner`                          | Tab-based named entity recognition format.                      |
+| `ner`                          | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
-| `iob`                          | IOB or IOB2 named entity recognition format.                    |
+| `iob`                          | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
 ## Train {#train}
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@ -10,6 +10,7 @@
                "en_vectors_web_lg",
                "en_pytt_bertbaseuncased_lg",
                "en_pytt_robertabase_lg",
                "pytt_distilbertbaseuncased_lg",
                "en_pytt_xlnetbasecased_lg"
            ],
            "example": "This is a sentence.",
		`@ -0,0 +1,2 @@`
							`When\|WRB\|O Sebastian\|NNP\|B-PERSON Thrun\|NNP\|I-PERSON started\|VBD\|O working\|VBG\|O on\|IN\|O self\|NN\|O -\|HYPH\|O driving\|VBG\|O cars\|NNS\|O at\|IN\|O Google\|NNP\|B-ORG in\|IN\|O 2007\|CD\|B-DATE ,\|,\|O few\|JJ\|O people\|NNS\|O outside\|RB\|O of\|IN\|O the\|DT\|O company\|NN\|O took\|VBD\|O him\|PRP\|O seriously\|RB\|O .\|.\|O`
							`“\|''\|O I\|PRP\|O can\|MD\|O tell\|VB\|O you\|PRP\|O very\|RB\|O senior\|JJ\|O CEOs\|NNS\|O of\|IN\|O major\|JJ\|O American\|JJ\|B-NORP car\|NN\|O companies\|NNS\|O would\|MD\|O shake\|VB\|O my\|PRP$\|O hand\|NN\|O and\|CC\|O turn\|VB\|O away\|RB\|O because\|IN\|O I\|PRP\|O was\|VBD\|O n’t\|RB\|O worth\|JJ\|O talking\|VBG\|O to\|IN\|O ,\|,\|O ”\|''\|O said\|VBD\|O Thrun\|NNP\|B-PERSON ,\|,\|O in\|IN\|O an\|DT\|O interview\|NN\|O with\|IN\|O Recode\|NNP\|B-ORG earlier\|RBR\|B-DATE this\|DT\|I-DATE week\|NN\|I-DATE .\|.\|O`