From f5c551a43a73c8b43bb12434fcbe4dbc65f5467f Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 21 Oct 2019 12:20:28 +0200 Subject: [PATCH 1/2] Checks/errors related to ill-formed IOB input in CLI convert and debug-data (#4487) * Error for ill-formed input to iob_to_biluo() Check for empty label in iob_to_biluo(), which can result from ill-formed input. * Check for empty NER label in debug-data --- spacy/cli/debug_data.py | 3 +++ spacy/errors.py | 1 + spacy/gold.pyx | 2 ++ spacy/tests/test_gold.py | 12 +++++++++++- 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index fe6cccf81..8161ddf45 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -206,6 +206,9 @@ def debug_data( missing_values, "value" if missing_values == 1 else "values" ) ) + for label in new_labels: + if len(label) == 0: + msg.fail("Empty label found in new labels") if new_labels: labels_with_counts = [ (label, count) diff --git a/spacy/errors.py b/spacy/errors.py index b2ccd3c49..23203d98a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -503,6 +503,7 @@ class Errors(object): "names: {names}") E175 = ("Can't remove rule for unknown match pattern ID: {key}") E176 = ("Alias '{alias}' is not defined in the Knowledge Base.") + E177 = ("Ill-formed IOB input detected: {tag}") @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 098df9ed2..1e626c4ed 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -522,6 +522,8 @@ def _consume_ent(tags): tags.pop(0) label = tag[2:] if length == 1: + if len(label) == 0: + raise ValueError(Errors.E177.format(tag=tag)) return ["U-" + label] else: start = "B-" + label diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 4f79c4463..234a91443 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags -from spacy.gold import 
spans_from_biluo_tags, GoldParse +from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo from spacy.gold import GoldCorpus, docs_to_json from spacy.lang.en import English from spacy.tokens import Doc @@ -87,6 +87,16 @@ def test_gold_ner_missing_tags(en_tokenizer): gold = GoldParse(doc, entities=biluo_tags) # noqa: F841 +def test_iob_to_biluo(): + good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"] + good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"] + bad_iob = ["O", "O", "\"", "B-LOC", "I-LOC"] + converted_biluo = iob_to_biluo(good_iob) + assert good_biluo == converted_biluo + with pytest.raises(ValueError): + iob_to_biluo(bad_iob) + + def test_roundtrip_docs_to_json(): text = "I flew to Silicon Valley via London." cats = {"TRAVEL": 1.0, "BAKING": 0.0} From b2f88e206036266ab68af80a7248e630d7b4ff95 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Oct 2019 12:26:07 +0200 Subject: [PATCH 2/2] Fix formatting [ci skip] --- website/docs/api/vocab.md | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index b94789921..e024ab54a 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -166,10 +166,9 @@ cosines are calculated in minibatches, to reduce memory usage. ## Vocab.get_vector {#get_vector tag="method" new="2"} Retrieve a vector for a word in the vocabulary. Words can be looked up by string -or hash value. If no vectors data is loaded, a `ValueError` is raised. - -If `minn` is defined, then the resulting vector uses Fasttext's -subword features by average over ngrams of `orth`. (Introduced in spaCy `v2.1`) +or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn` +is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s +subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`). 
> #### Example > @@ -178,12 +177,12 @@ subword features by average over ngrams of `orth`. (Introduced in spaCy `v2.1`) > nlp.vocab.get_vector("apple", minn=1, maxn=5) > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- | -| `orth` | int / unicode | The hash value of a word, or its unicode string. | -| `minn` | int | Minimum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. | -| `maxn` | int | Maximum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. | -| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance. | +| Name | Type | Description | +| ----------------------------------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- | +| `orth` | int / unicode | The hash value of a word, or its unicode string. | +| `minn` <Tag variant="new">2.1</Tag> | int | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. | +| `maxn` <Tag variant="new">2.1</Tag> | int | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. | +| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance. | ## Vocab.set_vector {#set_vector tag="method" new="2"}