diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 91204f671..756e6a5fa 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -456,12 +456,16 @@ cdef class GoldParse: if deps is None: deps = [None for _ in doc] if entities is None: - entities = [None for _ in doc] + entities = ['-' for _ in doc] elif len(entities) == 0: entities = ['O' for _ in doc] - elif not isinstance(entities[0], basestring): - # Assume we have entities specified by character offset. - entities = biluo_tags_from_offsets(doc, entities) + else: + # Translate the None values to '-', to make processing easier. + # See Issue #2603 + entities = [(ent if ent is not None else '-') for ent in entities] + if not isinstance(entities[0], basestring): + # Assume we have entities specified by character offset. + entities = biluo_tags_from_offsets(doc, entities) self.mem = Pool() self.loss = 0 diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 7a1d34c00..b43a879d4 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -141,7 +141,7 @@ cdef class BiluoPushDown(TransitionSystem): cdef Transition lookup_transition(self, object name) except *: cdef attr_t label - if name == '-' or name is None: + if name == '-' or name == '' or name is None: return Transition(clas=0, move=MISSING, label=0, score=0) elif name == '!O': return Transition(clas=0, move=ISNT, label=0, score=0) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 30dd2e6c6..21024312c 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags -from spacy.gold import spans_from_biluo_tags +from spacy.gold import spans_from_biluo_tags, GoldParse from spacy.tokens import Doc @@ -62,3 +62,9 @@ def test_biluo_spans(en_tokenizer): assert spans[0].label_ == "LOC" assert spans[1].text == "London" assert spans[1].label_ == "GPE" + +def test_gold_ner_missing_tags(en_tokenizer): + doc = 
en_tokenizer("I flew to Silicon Valley via London.") + biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] + gold = GoldParse(doc, entities=biluo_tags) + diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 1f71c5d58..82a850dbd 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -21,7 +21,7 @@ gradient for those labels will be zero. | `tags` | iterable | A sequence of strings, representing tag annotations. | | `heads` | iterable | A sequence of integers, representing syntactic head offsets. | | `deps` | iterable | A sequence of strings, representing the syntactic relation types. | -| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. | +| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. When the annotations are BILUO tag strings, you can mark a token's tag as missing by setting it to `None`. | | `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). | | **RETURNS** | `GoldParse` | The newly constructed object. |