💫 Improve handling of missing NER tags (closes #2603) (#3341)

* Improve handling of missing NER tags

GoldParse can accept missing NER tags, if entities is provided
in BILUO format (rather than as spans). Missing tags can be provided
as None values.

Fix bug that occurred when first tag was a None value. Closes #2603.

* Document specification of missing NER tags.
This commit is contained in:
Matthew Honnibal 2019-02-27 12:06:32 +01:00 committed by GitHub
parent c478a2ccb6
commit f1d77eb140
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 17 additions and 7 deletions

View File

@ -456,12 +456,16 @@ cdef class GoldParse:
if deps is None:
deps = [None for _ in doc]
if entities is None:
entities = [None for _ in doc]
entities = ['-' for _ in doc]
elif len(entities) == 0:
entities = ['O' for _ in doc]
elif not isinstance(entities[0], basestring):
# Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities)
else:
# Translate the None values to '-', to make processing easier.
# See Issue #2603
entities = [(ent if ent is not None else '-') for ent in entities]
if not isinstance(entities[0], basestring):
# Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities)
self.mem = Pool()
self.loss = 0

View File

@ -141,7 +141,7 @@ cdef class BiluoPushDown(TransitionSystem):
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name is None:
if name == '-' or name == '' or name is None:
return Transition(clas=0, move=MISSING, label=0, score=0)
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)

View File

@ -2,7 +2,7 @@
from __future__ import unicode_literals
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, GoldParse
from spacy.tokens import Doc
@ -62,3 +62,9 @@ def test_biluo_spans(en_tokenizer):
assert spans[0].label_ == "LOC"
assert spans[1].text == "London"
assert spans[1].label_ == "GPE"
def test_gold_ner_missing_tags(en_tokenizer):
doc = en_tokenizer("I flew to Silicon Valley via London.")
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
gold = GoldParse(doc, entities=biluo_tags)

View File

@ -21,7 +21,7 @@ gradient for those labels will be zero.
| `tags` | iterable | A sequence of strings, representing tag annotations. |
| `heads` | iterable | A sequence of integers, representing syntactic head offsets. |
| `deps` | iterable | A sequence of strings, representing the syntactic relation types. |
| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. |
| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. |
| `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). |
| **RETURNS** | `GoldParse` | The newly constructed object. |