💫 Make handling of [Pipe].labels consistent (#3273)

* Make handling of [Pipe].labels consistent

* Un-xfail passing test

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/tests/pipeline/test_pipe_methods.py

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Move error message to spacy.errors

* Fix textcat labels and test

* Make EntityRuler.labels return tuple as well
This commit is contained in:
Ines Montani 2019-02-14 20:03:19 +01:00 committed by Matthew Honnibal
parent 3d577b77c6
commit f146121092
5 changed files with 30 additions and 14 deletions

View File

@ -317,6 +317,10 @@ class Errors(object):
E113 = ("The newly split token can only have one root (head = 0).") E113 = ("The newly split token can only have one root (head = 0).")
E114 = ("The newly split token needs to have a root (head = 0)") E114 = ("The newly split token needs to have a root (head = 0)")
E115 = ("All subtokens must have associated heads") E115 = ("All subtokens must have associated heads")
E116 = ("Cannot currently add labels to pre-trained text classifier. Add "
"labels before training begins. This functionality was available "
"in previous versions, but had significant bugs that led to poor "
"performance")
@add_codes @add_codes

View File

@ -86,7 +86,7 @@ class EntityRuler(object):
""" """
all_labels = set(self.token_patterns.keys()) all_labels = set(self.token_patterns.keys())
all_labels.update(self.phrase_patterns.keys()) all_labels.update(self.phrase_patterns.keys())
return all_labels return tuple(all_labels)
@property @property
def patterns(self): def patterns(self):

View File

@ -358,7 +358,7 @@ class Tagger(Pipe):
@property @property
def labels(self): def labels(self):
return self.vocab.morphology.tag_names return tuple(self.vocab.morphology.tag_names)
@property @property
def tok2vec(self): def tok2vec(self):
@ -884,11 +884,11 @@ class TextCategorizer(Pipe):
@property @property
def labels(self): def labels(self):
return self.cfg.setdefault('labels', []) return tuple(self.cfg.setdefault('labels', []))
@labels.setter @labels.setter
def labels(self, value): def labels(self, value):
self.cfg['labels'] = value self.cfg['labels'] = tuple(value)
def __call__(self, doc): def __call__(self, doc):
scores, tensors = self.predict([doc]) scores, tensors = self.predict([doc])
@ -957,17 +957,13 @@ class TextCategorizer(Pipe):
# The problem is that we resize the last layer, but the last layer # The problem is that we resize the last layer, but the last layer
# is actually just an ensemble. We're not resizing the child layers # is actually just an ensemble. We're not resizing the child layers
# -- a huge problem. # -- a huge problem.
raise ValueError( raise ValueError(Errors.E116)
"Cannot currently add labels to pre-trained text classifier. "
"Add labels before training begins. This functionality was "
"available in previous versions, but had significant bugs that "
"let to poor performance")
#smaller = self.model._layers[-1] #smaller = self.model._layers[-1]
#larger = Affine(len(self.labels)+1, smaller.nI) #larger = Affine(len(self.labels)+1, smaller.nI)
#copy_array(larger.W[:smaller.nO], smaller.W) #copy_array(larger.W[:smaller.nO], smaller.W)
#copy_array(larger.b[:smaller.nO], smaller.b) #copy_array(larger.b[:smaller.nO], smaller.b)
#self.model._layers[-1] = larger #self.model._layers[-1] = larger
self.labels.append(label) self.labels = tuple(list(self.labels) + [label])
return 1 return 1
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
@ -1012,6 +1008,11 @@ cdef class DependencyParser(Parser):
return (DependencyParser, (self.vocab, self.moves, self.model), return (DependencyParser, (self.vocab, self.moves, self.model),
None, None) None, None)
@property
def labels(self):
    """Return the unique labels found in the parser's available moves.

    Labels are read from ``self.move_names``. A labelled move name is taken
    to have the form ``<ACTION>-<LABEL>``; the label is everything after the
    first ``-``.

    RETURNS (tuple): The distinct labels in sorted order, so the result is
        deterministic across calls and processes.
    """
    found = set()
    for move in self.move_names:
        # A move name without a "-" carries no label. Indexing the result
        # of split("-") at [1] would raise IndexError for it, so skip it.
        # NOTE(review): presumably un-labelled actions such as a bare shift
        # show up this way -- confirm against the transition system.
        _, separator, label = move.partition("-")
        if separator and label:
            found.add(label)
    return tuple(sorted(found))
cdef class EntityRecognizer(Parser): cdef class EntityRecognizer(Parser):
name = "ner" name = "ner"
@ -1040,8 +1041,8 @@ cdef class EntityRecognizer(Parser):
def labels(self): def labels(self):
# Get the labels from the model by looking at the available moves, e.g. # Get the labels from the model by looking at the available moves, e.g.
# B-PERSON, I-PERSON, L-PERSON, U-PERSON # B-PERSON, I-PERSON, L-PERSON, U-PERSON
return [move.split("-")[1] for move in self.move_names return tuple(set(move.split("-")[1] for move in self.move_names
if move[0] in ("B", "I", "L", "U")] if move[0] in ("B", "I", "L", "U")))
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer'] __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer']

View File

@ -112,3 +112,15 @@ def test_add_lots_of_pipes(nlp, n_pipes):
def test_raise_for_invalid_components(nlp, component): def test_raise_for_invalid_components(nlp, component):
with pytest.raises(ValueError): with pytest.raises(ValueError):
nlp.add_pipe(component) nlp.add_pipe(component)
@pytest.mark.parametrize("component", ["ner", "tagger", "parser", "textcat"])
def test_pipe_base_class_add_label(nlp, component):
    """Check that a label added via ``add_label`` is exposed by ``labels``.

    Runs once per built-in component name. Every component except the tagger
    is expected to report exactly the one added label as a tuple.
    """
    new_label = "TEST"
    created = nlp.create_pipe(component)
    created.add_label(new_label)
    if component != "tagger":
        assert created.labels == (new_label,)
        return
    # Tagger always has the default coarse-grained label scheme, so only
    # membership of the new label can be checked here.
    assert new_label in created.labels

View File

@ -11,7 +11,6 @@ import numpy
from ..util import add_vecs_to_vocab, get_doc from ..util import add_vecs_to_vocab, get_doc
@pytest.mark.xfail
def test_issue2179(): def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER.""" """Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian() nlp = Italian()
@ -23,7 +22,7 @@ def test_issue2179():
nlp2.add_pipe(nlp2.create_pipe("ner")) nlp2.add_pipe(nlp2.create_pipe("ner"))
nlp2.from_bytes(nlp.to_bytes()) nlp2.from_bytes(nlp.to_bytes())
assert "extra_labels" not in nlp2.get_pipe("ner").cfg assert "extra_labels" not in nlp2.get_pipe("ner").cfg
assert nlp2.get_pipe("ner").labels == ["CITIZENSHIP"] assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
def test_issue2219(en_vocab): def test_issue2219(en_vocab):