💫 Improve handling of missing NER tags (closes #2603) (#3341)

* Improve handling of missing NER tags

  GoldParse can accept missing NER tags if `entities` is provided in BILUO format (rather than as spans). Missing tags can be provided as `None` values. Fixes a bug that occurred when the first tag was a `None` value. Closes #2603.

* Document the specification of missing NER tags.
2025-08-10 07:04:53 +03:00 · 2019-02-27 12:06:32 +01:00 · 2019-02-27 12:06:32 +01:00 · f1d77eb140
commit f1d77eb140
parent c478a2ccb6
4 changed files with 17 additions and 7 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -456,12 +456,16 @@ cdef class GoldParse:
        if deps is None:
            deps = [None for _ in doc]
        if entities is None:
-            entities = [None for _ in doc]
+            entities = ['-' for _ in doc]
        elif len(entities) == 0:
            entities = ['O' for _ in doc]
-        elif not isinstance(entities[0], basestring):
-            # Assume we have entities specified by character offset.
-            entities = biluo_tags_from_offsets(doc, entities)
+        else:
+            # Translate the None values to '-', to make processing easier.
+            # See Issue #2603
+            entities = [(ent if ent is not None else '-') for ent in entities]
+            if not isinstance(entities[0], basestring):
+                # Assume we have entities specified by character offset.
+                entities = biluo_tags_from_offsets(doc, entities)

        self.mem = Pool()
        self.loss = 0
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@ -141,7 +141,7 @@ cdef class BiluoPushDown(TransitionSystem):

    cdef Transition lookup_transition(self, object name) except *:
        cdef attr_t label
-        if name == '-' or name is None:
+        if name == '-' or name == '' or name is None:
            return Transition(clas=0, move=MISSING, label=0, score=0)
        elif name == '!O':
            return Transition(clas=0, move=ISNT, label=0, score=0)
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
-from spacy.gold import spans_from_biluo_tags
+from spacy.gold import spans_from_biluo_tags, GoldParse
 from spacy.tokens import Doc


@ -62,3 +62,9 @@ def test_biluo_spans(en_tokenizer):
    assert spans[0].label_ == "LOC"
    assert spans[1].text == "London"
    assert spans[1].label_ == "GPE"
+
+def test_gold_ner_missing_tags(en_tokenizer):
+    doc = en_tokenizer("I flew to Silicon Valley via London.")
+    biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
+    gold = GoldParse(doc, entities=biluo_tags)
+ 
--- a/website/docs/api/goldparse.md
+++ b/website/docs/api/goldparse.md
@ -21,7 +21,7 @@ gradient for those labels will be zero.
 | `tags`      | iterable    | A sequence of strings, representing tag annotations.                                                                                                                                                                      |
 | `heads`     | iterable    | A sequence of integers, representing syntactic head offsets.                                                                                                                                                              |
 | `deps`      | iterable    | A sequence of strings, representing the syntactic relation types.                                                                                                                                                         |
-| `entities`  | iterable    | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions.                                                                     |
+| `entities`  | iterable    | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings are used, missing values can be specified by setting the tag to `None`. |
 | `cats`      | dict        | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). |
 | **RETURNS** | `GoldParse` | The newly constructed object.                                                                                                                                                                                             |