From f1461210922e55f2419ab36b5e7cac051c346cf1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 14 Feb 2019 20:03:19 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20=20Make=20handling=20of=20[Pipe]?= =?UTF-8?q?.labels=20consistent=20=20(#3273)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Make handling of [Pipe].labels consistent * Un-xfail passing test * Update spacy/pipeline/pipes.pyx Co-Authored-By: ines * Update spacy/pipeline/pipes.pyx Co-Authored-By: ines * Update spacy/tests/pipeline/test_pipe_methods.py Co-Authored-By: ines * Update spacy/pipeline/pipes.pyx Co-Authored-By: ines * Move error message to spacy.errors * Fix textcat labels and test * Make EntityRuler.labels return tuple as well --- spacy/errors.py | 4 ++++ spacy/pipeline/entityruler.py | 2 +- spacy/pipeline/pipes.pyx | 23 ++++++++++--------- spacy/tests/pipeline/test_pipe_methods.py | 12 ++++++++++ spacy/tests/regression/test_issue2001-2500.py | 3 +-- 5 files changed, 30 insertions(+), 14 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index f689963dd..37deb4560 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -317,6 +317,10 @@ class Errors(object): E113 = ("The newly split token can only have one root (head = 0).") E114 = ("The newly split token needs to have a root (head = 0)") E115 = ("All subtokens must have associated heads") + E116 = ("Cannot currently add labels to pre-trained text classifier. Add " + "labels before training begins. This functionality was available " + "in previous versions, but had significant bugs that led to poor " + "performance") @add_codes diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index bef929411..6b757fe7c 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -86,7 +86,7 @@ class EntityRuler(object): """ all_labels = set(self.token_patterns.keys()) all_labels.update(self.phrase_patterns.keys()) - return all_labels + return tuple(all_labels) @property def patterns(self): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 92fecb8b3..153e053e2 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -358,7 +358,7 @@ class Tagger(Pipe): @property def labels(self): - return self.vocab.morphology.tag_names + return tuple(self.vocab.morphology.tag_names) @property def tok2vec(self): @@ -884,11 +884,11 @@ class TextCategorizer(Pipe): @property def labels(self): - return self.cfg.setdefault('labels', []) + return tuple(self.cfg.setdefault('labels', [])) @labels.setter def labels(self, value): - self.cfg['labels'] = value + self.cfg['labels'] = tuple(value) def __call__(self, doc): scores, tensors = self.predict([doc]) @@ -957,17 +957,13 @@ class TextCategorizer(Pipe): # The problem is that we resize the last layer, but the last layer # is actually just an ensemble. We're not resizing the child layers # -- a huge problem. - raise ValueError( - "Cannot currently add labels to pre-trained text classifier. " - "Add labels before training begins. This functionality was " - "available in previous versions, but had significant bugs that " - "let to poor performance") + raise ValueError(Errors.E116) #smaller = self.model._layers[-1] #larger = Affine(len(self.labels)+1, smaller.nI) #copy_array(larger.W[:smaller.nO], smaller.W) #copy_array(larger.b[:smaller.nO], smaller.b) #self.model._layers[-1] = larger - self.labels.append(label) + self.labels = tuple(list(self.labels) + [label]) return 1 def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, @@ -1012,6 +1008,11 @@ cdef class DependencyParser(Parser): return (DependencyParser, (self.vocab, self.moves, self.model), None, None) + @property + def labels(self): + # Get the labels from the model by looking at the available moves + return tuple(set(move.split("-")[1] for move in self.move_names)) + cdef class EntityRecognizer(Parser): name = "ner" @@ -1040,8 +1041,8 @@ cdef class EntityRecognizer(Parser): def labels(self): # Get the labels from the model by looking at the available moves, e.g. # B-PERSON, I-PERSON, L-PERSON, U-PERSON - return [move.split("-")[1] for move in self.move_names - if move[0] in ("B", "I", "L", "U")] + return tuple(set(move.split("-")[1] for move in self.move_names + if move[0] in ("B", "I", "L", "U"))) __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer'] diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index bd779d5c2..d36201718 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -112,3 +112,15 @@ def test_add_lots_of_pipes(nlp, n_pipes): def test_raise_for_invalid_components(nlp, component): with pytest.raises(ValueError): nlp.add_pipe(component) + + +@pytest.mark.parametrize("component", ["ner", "tagger", "parser", "textcat"]) +def test_pipe_base_class_add_label(nlp, component): + label = "TEST" + pipe = nlp.create_pipe(component) + pipe.add_label(label) + if component == "tagger": + # Tagger always has the default coarse-grained label scheme + assert label in pipe.labels + else: + assert pipe.labels == (label,) diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 70040898f..3922db6d3 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -11,7 +11,6 @@ import numpy from ..util import add_vecs_to_vocab, get_doc -@pytest.mark.xfail def test_issue2179(): """Test that spurious 'extra_labels' aren't created when initializing NER.""" nlp = Italian() @@ -23,7 +22,7 @@ def test_issue2179(): nlp2.add_pipe(nlp2.create_pipe("ner")) nlp2.from_bytes(nlp.to_bytes()) assert "extra_labels" not in nlp2.get_pipe("ner").cfg - assert nlp2.get_pipe("ner").labels == ["CITIZENSHIP"] + assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) def test_issue2219(en_vocab):