From 2569339a98e39ec84680fb193fc50f42f1a3098c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 14 Feb 2019 18:05:07 +0100 Subject: [PATCH 1/3] Formatting and whitespace [ci skip] --- spacy/tokens/_retokenize.pyx | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 8b737c8a3..99905ebc2 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -21,6 +21,7 @@ from ..attrs import intify_attrs from ..util import SimpleFrozenDict from ..errors import Errors + cdef class Retokenizer: """Helper class for doc.retokenize() context manager.""" cdef Doc doc @@ -174,25 +175,21 @@ def _bulk_merge(Doc doc, merges): def _get_start(merge): return merge[0].start - merges.sort(key=_get_start) + merges.sort(key=_get_start) for merge_index, (span, attributes) in enumerate(merges): start = span.start end = span.end spans.append(span) - # House the new merged token where it starts token = &doc.c[start] - tokens[merge_index] = token - # Assign attributes for attr_name, attr_value in attributes.items(): if attr_name == TAG: doc.vocab.morphology.assign_tag(token, attr_value) else: Token.set_struct_attr(token, attr_name, attr_value) - # Resize the doc.tensor, if it's set. Let the last row for each token stand # for the merged region. To do this, we create a boolean array indicating # whether the row is to be deleted, then use numpy.delete @@ -205,7 +202,6 @@ def _bulk_merge(Doc doc, merges): for i, span in enumerate(spans): span_roots.append(span.root.i) tokens[i].dep = span.root.dep - # We update token.lex after keeping span root and dep, since # setting token.lex will change span.start and span.end properties # as it modifies the character offsets in the doc @@ -217,7 +213,6 @@ def _bulk_merge(Doc doc, merges): tokens[token_index].lex = lex # We set trailing space here too tokens[token_index].spacy = doc.c[spans[token_index].end-1].spacy - # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets # Before thinking of something simpler, beware the case where a @@ -225,11 +220,9 @@ def _bulk_merge(Doc doc, merges): # tokens changes. 
for i in range(doc.length): doc.c[i].head += i - # Set the head of the merged token from the Span for i in range(len(merges)): tokens[i].head = doc.c[span_roots[i]].head - # Adjust deps before shrinking tokens # Tokens which point into the merged token should now point to it # Subtract the offset from all tokens which point to >= end @@ -241,16 +234,13 @@ def _bulk_merge(Doc doc, merges): #last token was the last of the span current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1 current_span_index += 1 - if current_span_index < len(spans) and \ spans[current_span_index].start <= i < spans[current_span_index].end: offsets.append(spans[current_span_index].start - current_offset) else: offsets.append(i - current_offset) - for i in range(doc.length): doc.c[i].head = offsets[doc.c[i].head] - # Now compress the token array offset = 0 in_span = False @@ -272,14 +262,11 @@ def _bulk_merge(Doc doc, merges): memset(&doc.c[i], 0, sizeof(TokenC)) doc.c[i].lex = &EMPTY_LEXEME doc.length -= offset - # ...And, set heads back to a relative position for i in range(doc.length): doc.c[i].head -= i - # Set the left/right children, left/right edges set_children_from_heads(doc.c, doc.length) - # Make sure ent_iob remains consistent for (span, _) in merges: if(span.end < len(offsets)): @@ -329,13 +316,10 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs): token_head_index = index if token_head_index == -1: raise ValueError(Errors.E113) - # First, make the dependencies absolutes, and adjust all possible dependencies before # creating the tokens - for i in range(doc.length): doc.c[i].head += i - # Adjust dependencies offset = nb_subtokens - 1 for i in range(doc.length): @@ -344,22 +328,17 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs): doc.c[i].head = token_head_index elif head_idx > token_index: doc.c[i].head += offset - new_token_head = doc.c[token_index].head - # Double doc.c max_length if necessary (until big enough for all new tokens) while doc.length + nb_subtokens - 1 >= doc.max_length: doc._realloc(doc.length * 2) - # Move tokens after the split to create space for the new tokens doc.length = len(doc) + nb_subtokens -1 for token_to_move in range(doc.length - 1, token_index, -1): doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move] - # Host the tokens in the newly created space cdef int idx_offset = 0 for i, orth in enumerate(orths): - token = &doc.c[token_index + i] lex = doc.vocab.get(doc.mem, orth) token.lex = lex @@ -367,21 +346,18 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs): if i != 0: token.idx = orig_token.idx + idx_offset idx_offset += len(orth) - # Set token.spacy to False for all non-last split tokens, and # to origToken.spacy for the last token if (i < nb_subtokens - 1): token.spacy = False else: token.spacy = orig_token.spacy - # Apply attrs to each subtoken for attr_name, attr_value in attrs.items(): if attr_name == TAG: doc.vocab.morphology.assign_tag(token, attr_value) else: Token.set_struct_attr(token, attr_name, attr_value) - # Make IOB consistent if (orig_token.ent_iob == 3): if i == 0: @@ -391,22 +367,17 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs): else: # In all other cases subtokens inherit iob from origToken token.ent_iob = orig_token.ent_iob - # Use the head of the new token everywhere. This will be partially overwritten later on. 
token.head = new_token_head - # Transform the dependencies into relative ones again for i in range(doc.length): doc.c[i].head -= i - # Assign correct dependencies to the inner token for i, head in enumerate(heads): if head != 0: # the token's head's head is already correct doc.c[token_index + i].head = head - for i, dep in enumerate(deps): doc[token_index + i].dep = dep - # set children from head set_children_from_heads(doc.c, doc.length) From 3d577b77c660f38521daf13e84c5107e169cd6d3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 14 Feb 2019 19:56:38 +0100 Subject: [PATCH 2/3] Auto-formatting --- spacy/tests/doc/test_doc_spilt.py | 60 +++++++++++++++++-------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/spacy/tests/doc/test_doc_spilt.py b/spacy/tests/doc/test_doc_spilt.py index 827fd565e..bcadf4eb4 100644 --- a/spacy/tests/doc/test_doc_spilt.py +++ b/spacy/tests/doc/test_doc_spilt.py @@ -1,12 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from ..util import get_doc -from ...vocab import Vocab -from ...tokens import Doc -from ...tokens import Span - import pytest +from spacy.vocab import Vocab +from spacy.tokens import Doc + +from ..util import get_doc def test_doc_split(en_tokenizer): @@ -17,35 +16,41 @@ def test_doc_split(en_tokenizer): assert len(doc) == 3 assert len(str(doc)) == 19 - assert doc[0].head.text == 'start' - assert doc[1].head.text == '.' + assert doc[0].head.text == "start" + assert doc[1].head.text == "." with doc.retokenize() as retokenizer: - retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'}) + retokenizer.split( + doc[0], + ["Los", "Angeles"], + [1, 0], + attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"}, + ) assert len(doc) == 4 - assert doc[0].text == 'Los' - assert doc[0].head.text == 'Angeles' + assert doc[0].text == "Los" + assert doc[0].head.text == "Angeles" assert doc[0].idx == 0 assert doc[1].idx == 3 - assert doc[1].text == 'Angeles' - assert doc[1].head.text == 'start' + assert doc[1].text == "Angeles" + assert doc[1].head.text == "start" - assert doc[2].text == 'start' - assert doc[2].head.text == '.' + assert doc[2].text == "start" + assert doc[2].head.text == "." - assert doc[3].text == '.' - assert doc[3].head.text == '.' + assert doc[3].text == "." + assert doc[3].head.text == "." assert len(str(doc)) == 19 + def test_split_dependencies(en_tokenizer): text = "LosAngeles start." tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, [t.text for t in tokens]) - dep1 = doc.vocab.strings.add('amod') - dep2 = doc.vocab.strings.add('subject') + dep1 = doc.vocab.strings.add("amod") + dep2 = doc.vocab.strings.add("subject") with doc.retokenize() as retokenizer: retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2]) @@ -53,27 +58,26 @@ def test_split_dependencies(en_tokenizer): assert doc[1].dep == dep2 - def test_split_heads_error(en_tokenizer): text = "LosAngeles start." 
tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, [t.text for t in tokens]) - #Not enough heads + # Not enough heads with pytest.raises(ValueError): with doc.retokenize() as retokenizer: retokenizer.split(doc[0], ["Los", "Angeles"], [0]) - #Too many heads + # Too many heads with pytest.raises(ValueError): with doc.retokenize() as retokenizer: retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0]) - #No token head + # No token head with pytest.raises(ValueError): with doc.retokenize() as retokenizer: retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1]) - #Several token heads + # Several token heads with pytest.raises(ValueError): with doc.retokenize() as retokenizer: retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0]) @@ -83,7 +87,7 @@ def test_spans_entity_merge_iob(): # Test entity IOB stays consistent after merging words = ["abc", "d", "e"] doc = Doc(Vocab(), words=words) - doc.ents = [(doc.vocab.strings.add('ent-abcd'), 0, 2)] + doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)] assert doc[0].ent_iob_ == "B" assert doc[1].ent_iob_ == "I" @@ -94,12 +98,14 @@ def test_spans_entity_merge_iob(): assert doc[2].ent_iob_ == "I" assert doc[3].ent_iob_ == "I" + def test_spans_sentence_update_after_merge(en_tokenizer): + # fmt: off text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale." heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2] - deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr', - 'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj', - 'compound', 'punct'] + deps = ["nsubj", "ROOT", "det", "amod", "prt", "attr", "punct", "nsubj", + "ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"] + # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps) From f1461210922e55f2419ab36b5e7cac051c346cf1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 14 Feb 2019 20:03:19 +0100 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=92=AB=20=20Make=20handling=20of=20[P?= =?UTF-8?q?ipe].labels=20consistent=20=20(#3273)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Make handling of [Pipe].labels consistent * Un-xfail passing test * Update spacy/pipeline/pipes.pyx Co-Authored-By: ines * Update spacy/pipeline/pipes.pyx Co-Authored-By: ines * Update spacy/tests/pipeline/test_pipe_methods.py Co-Authored-By: ines * Update spacy/pipeline/pipes.pyx Co-Authored-By: ines * Move error message to spacy.errors * Fix textcat labels and test * Make EntityRuler.labels return tuple as well --- spacy/errors.py | 4 ++++ spacy/pipeline/entityruler.py | 2 +- spacy/pipeline/pipes.pyx | 23 ++++++++++--------- spacy/tests/pipeline/test_pipe_methods.py | 12 ++++++++++ spacy/tests/regression/test_issue2001-2500.py | 3 +-- 5 files changed, 30 insertions(+), 14 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index f689963dd..37deb4560 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -317,6 +317,10 @@ class Errors(object): E113 = ("The newly split token can only have one root (head = 0).") E114 = ("The newly split token needs to have a root (head = 0)") E115 = ("All subtokens must have associated heads") + E116 = ("Cannot currently add labels to pre-trained text classifier. Add " + "labels before training begins. 
This functionality was available " + "in previous versions, but had significant bugs that led to poor " + "performance") @add_codes diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index bef929411..6b757fe7c 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -86,7 +86,7 @@ class EntityRuler(object): """ all_labels = set(self.token_patterns.keys()) all_labels.update(self.phrase_patterns.keys()) - return all_labels + return tuple(all_labels) @property def patterns(self): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 92fecb8b3..153e053e2 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -358,7 +358,7 @@ class Tagger(Pipe): @property def labels(self): - return self.vocab.morphology.tag_names + return tuple(self.vocab.morphology.tag_names) @property def tok2vec(self): @@ -884,11 +884,11 @@ class TextCategorizer(Pipe): @property def labels(self): - return self.cfg.setdefault('labels', []) + return tuple(self.cfg.setdefault('labels', [])) @labels.setter def labels(self, value): - self.cfg['labels'] = value + self.cfg['labels'] = tuple(value) def __call__(self, doc): scores, tensors = self.predict([doc]) @@ -957,17 +957,13 @@ class TextCategorizer(Pipe): # The problem is that we resize the last layer, but the last layer # is actually just an ensemble. We're not resizing the child layers # -- a huge problem. - raise ValueError( - "Cannot currently add labels to pre-trained text classifier. " - "Add labels before training begins. This functionality was " - "available in previous versions, but had significant bugs that " - "let to poor performance") + raise ValueError(Errors.E116) #smaller = self.model._layers[-1] #larger = Affine(len(self.labels)+1, smaller.nI) #copy_array(larger.W[:smaller.nO], smaller.W) #copy_array(larger.b[:smaller.nO], smaller.b) #self.model._layers[-1] = larger - self.labels.append(label) + self.labels = tuple(list(self.labels) + [label]) return 1 def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, @@ -1012,6 +1008,11 @@ cdef class DependencyParser(Parser): return (DependencyParser, (self.vocab, self.moves, self.model), None, None) + @property + def labels(self): + # Get the labels from the model by looking at the available moves + return tuple(set(move.split("-")[1] for move in self.move_names)) + cdef class EntityRecognizer(Parser): name = "ner" @@ -1040,8 +1041,8 @@ cdef class EntityRecognizer(Parser): def labels(self): # Get the labels from the model by looking at the available moves, e.g. 
# B-PERSON, I-PERSON, L-PERSON, U-PERSON - return [move.split("-")[1] for move in self.move_names - if move[0] in ("B", "I", "L", "U")] + return tuple(set(move.split("-")[1] for move in self.move_names + if move[0] in ("B", "I", "L", "U"))) __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer'] diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index bd779d5c2..d36201718 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -112,3 +112,15 @@ def test_add_lots_of_pipes(nlp, n_pipes): def test_raise_for_invalid_components(nlp, component): with pytest.raises(ValueError): nlp.add_pipe(component) + + +@pytest.mark.parametrize("component", ["ner", "tagger", "parser", "textcat"]) +def test_pipe_base_class_add_label(nlp, component): + label = "TEST" + pipe = nlp.create_pipe(component) + pipe.add_label(label) + if component == "tagger": + # Tagger always has the default coarse-grained label scheme + assert label in pipe.labels + else: + assert pipe.labels == (label,) diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 70040898f..3922db6d3 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -11,7 +11,6 @@ import numpy from ..util import add_vecs_to_vocab, get_doc -@pytest.mark.xfail def test_issue2179(): """Test that spurious 'extra_labels' aren't created when initializing NER.""" nlp = Italian() @@ -23,7 +22,7 @@ def test_issue2179(): nlp2.add_pipe(nlp2.create_pipe("ner")) nlp2.from_bytes(nlp.to_bytes()) assert "extra_labels" not in nlp2.get_pipe("ner").cfg - assert nlp2.get_pipe("ner").labels == ["CITIZENSHIP"] + assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) def test_issue2219(en_vocab):
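
The patches above touch two user-facing APIs: the retokenizer's split() (patches 1 and 2) and the labels property of the pipeline components (patch 3). The two sketches below are not part of the patches; they are minimal usage examples assuming a spaCy checkout that includes these commits, and the labels they use ("amod", "subject", "MY_LABEL", "MY_TAG") are made up purely for illustration.

First, retokenizer.split() as exercised by the reformatted tests in test_doc_spilt.py: it takes the token to split, the subtoken texts, per-subtoken head offsets and, optionally, a list of dependency label IDs, mirroring test_split_dependencies.

from spacy.lang.en import English

nlp = English()  # blank English pipeline: tokenizer only
doc = nlp("LosAngeles start.")
dep_amod = doc.vocab.strings.add("amod")      # example dependency labels
dep_subj = doc.vocab.strings.add("subject")
with doc.retokenize() as retokenizer:
    # Subtoken heads are relative offsets: "Los" attaches to the token one
    # position to its right ("Angeles"); 0 marks the subtoken that inherits
    # the original token's head and acts as the root of the split.
    retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep_amod, dep_subj])
assert [t.text for t in doc] == ["Los", "Angeles", "start", "."]
assert doc[0].dep == dep_amod
assert doc[1].dep == dep_subj

Second, patch 3 makes every component's labels property return a tuple and moves the "can't add labels to a pre-trained text classifier" message into spacy.errors as E116. A sketch of the resulting behaviour, mirroring the new test_pipe_base_class_add_label test:

from spacy.lang.en import English

nlp = English()
for name in ("ner", "parser", "textcat"):
    pipe = nlp.create_pipe(name)
    pipe.add_label("MY_LABEL")
    # labels is now consistently a tuple, whatever the component
    assert pipe.labels == ("MY_LABEL",)

tagger = nlp.create_pipe("tagger")
tagger.add_label("MY_TAG")
# The tagger keeps the default coarse-grained tag scheme alongside new tags,
# so we only check membership rather than the full tuple.
assert "MY_TAG" in tagger.labels
assert isinstance(tagger.labels, tuple)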