From f5c551a43a73c8b43bb12434fcbe4dbc65f5467f Mon Sep 17 00:00:00 2001
From: adrianeboyd <adrianeboyd@gmail.com>
Date: Mon, 21 Oct 2019 12:20:28 +0200
Subject: [PATCH 1/2] Checks/errors related to ill-formed IOB input in CLI
 convert and debug-data (#4487)

* Error for ill-formed input to iob_to_biluo()

Check for empty label in iob_to_biluo(), which can result from
ill-formed input.

* Check for empty NER label in debug-data
---
 spacy/cli/debug_data.py  |  3 +++
 spacy/errors.py          |  1 +
 spacy/gold.pyx           |  2 ++
 spacy/tests/test_gold.py | 12 +++++++++++-
 4 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index fe6cccf81..8161ddf45 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -206,6 +206,9 @@ def debug_data(
                 missing_values, "value" if missing_values == 1 else "values"
             )
         )
+        for label in new_labels:
+            if len(label) == 0:
+                msg.fail("Empty label found in new labels")
         if new_labels:
             labels_with_counts = [
                 (label, count)
diff --git a/spacy/errors.py b/spacy/errors.py
index b2ccd3c49..23203d98a 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -503,6 +503,7 @@ class Errors(object):
             "names: {names}")
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
     E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
+    E177 = ("Ill-formed IOB input detected: {tag}")
 
 
 @add_codes
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 098df9ed2..1e626c4ed 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -522,6 +522,8 @@ def _consume_ent(tags):
         tags.pop(0)
     label = tag[2:]
     if length == 1:
+        if len(label) == 0:
+            raise ValueError(Errors.E177.format(tag=tag))
         return ["U-" + label]
     else:
         start = "B-" + label
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 4f79c4463..234a91443 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
-from spacy.gold import spans_from_biluo_tags, GoldParse
+from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
 from spacy.gold import GoldCorpus, docs_to_json
 from spacy.lang.en import English
 from spacy.tokens import Doc
@@ -87,6 +87,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
     gold = GoldParse(doc, entities=biluo_tags)  # noqa: F841
 
 
+def test_iob_to_biluo():
+    good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
+    good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
+    bad_iob = ["O", "O", "\"", "B-LOC", "I-LOC"]
+    converted_biluo = iob_to_biluo(good_iob)
+    assert good_biluo == converted_biluo
+    with pytest.raises(ValueError):
+        iob_to_biluo(bad_iob)
+
+
 def test_roundtrip_docs_to_json():
     text = "I flew to Silicon Valley via London."
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}

From b2f88e206036266ab68af80a7248e630d7b4ff95 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 21 Oct 2019 12:26:07 +0200
Subject: [PATCH 2/2] Fix formatting [ci skip]

---
 website/docs/api/vocab.md | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md
index b94789921..e024ab54a 100644
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@@ -166,10 +166,9 @@ cosines are calculated in minibatches, to reduce memory usage.
 ## Vocab.get_vector {#get_vector tag="method" new="2"}
 
 Retrieve a vector for a word in the vocabulary. Words can be looked up by string
-or hash value. If no vectors data is loaded, a `ValueError` is raised.
-
-If `minn` is defined, then the resulting vector uses Fasttext's 
-subword features by average over ngrams of `orth`. (Introduced in spaCy `v2.1`)
+or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn`
+is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s
+subword features by averaging over ngrams of `orth` (introduced in spaCy `v2.1`).
 
 > #### Example
 >
@@ -178,12 +177,12 @@ subword features by average over ngrams of `orth`. (Introduced in spaCy `v2.1`)
 > nlp.vocab.get_vector("apple", minn=1, maxn=5)
 > ```
 
-| Name        | Type                                     | Description                                                                                    |
-| ----------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- |
-| `orth`      | int / unicode                            | The hash value of a word, or its unicode string.                                               |
-| `minn`      | int                                      | Minimum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. |
-| `maxn`      | int                                      | Maximum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. |
-| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance.                  |
+| Name                                | Type                                     | Description                                                                                    |
+| ----------------------------------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- |
+| `orth`                              | int / unicode                            | The hash value of a word, or its unicode string.                                               |
+| `minn` <Tag variant="new">2.1</Tag> | int                                      | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. |
+| `maxn` <Tag variant="new">2.1</Tag> | int                                      | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. |
+| **RETURNS**                         | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance.                  |
 
 ## Vocab.set_vector {#set_vector tag="method" new="2"}