From 2569339a98e39ec84680fb193fc50f42f1a3098c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 14 Feb 2019 18:05:07 +0100 Subject: [PATCH 1/3] Formatting and whitespace [ci skip] --- spacy/tokens/_retokenize.pyx | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 8b737c8a3..99905ebc2 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -21,6 +21,7 @@ from ..attrs import intify_attrs from ..util import SimpleFrozenDict from ..errors import Errors + cdef class Retokenizer: """Helper class for doc.retokenize() context manager.""" cdef Doc doc @@ -174,25 +175,21 @@ def _bulk_merge(Doc doc, merges): def _get_start(merge): return merge[0].start - merges.sort(key=_get_start) + merges.sort(key=_get_start) for merge_index, (span, attributes) in enumerate(merges): start = span.start end = span.end spans.append(span) - # House the new merged token where it starts token = &doc.c[start] - tokens[merge_index] = token - # Assign attributes for attr_name, attr_value in attributes.items(): if attr_name == TAG: doc.vocab.morphology.assign_tag(token, attr_value) else: Token.set_struct_attr(token, attr_name, attr_value) - # Resize the doc.tensor, if it's set. Let the last row for each token stand # for the merged region. To do this, we create a boolean array indicating # whether the row is to be deleted, then use numpy.delete @@ -205,7 +202,6 @@ def _bulk_merge(Doc doc, merges): for i, span in enumerate(spans): span_roots.append(span.root.i) tokens[i].dep = span.root.dep - # We update token.lex after keeping span root and dep, since # setting token.lex will change span.start and span.end properties # as it modifies the character offsets in the doc @@ -217,7 +213,6 @@ def _bulk_merge(Doc doc, merges): tokens[token_index].lex = lex # We set trailing space here too tokens[token_index].spacy = doc.c[spans[token_index].end-1].spacy - # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets # Before thinking of something simpler, beware the case where a @@ -225,11 +220,9 @@ def _bulk_merge(Doc doc, merges): # tokens changes. 
for i in range(doc.length): doc.c[i].head += i - # Set the head of the merged token from the Span for i in range(len(merges)): tokens[i].head = doc.c[span_roots[i]].head - # Adjust deps before shrinking tokens # Tokens which point into the merged token should now point to it # Subtract the offset from all tokens which point to >= end @@ -241,16 +234,13 @@ def _bulk_merge(Doc doc, merges): #last token was the last of the span current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1 current_span_index += 1 - if current_span_index < len(spans) and \ spans[current_span_index].start <= i < spans[current_span_index].end: offsets.append(spans[current_span_index].start - current_offset) else: offsets.append(i - current_offset) - for i in range(doc.length): doc.c[i].head = offsets[doc.c[i].head] - # Now compress the token array offset = 0 in_span = False @@ -272,14 +262,11 @@ def _bulk_merge(Doc doc, merges): memset(&doc.c[i], 0, sizeof(TokenC)) doc.c[i].lex = &EMPTY_LEXEME doc.length -= offset - # ...And, set heads back to a relative position for i in range(doc.length): doc.c[i].head -= i - # Set the left/right children, left/right edges set_children_from_heads(doc.c, doc.length) - # Make sure ent_iob remains consistent for (span, _) in merges: if(span.end < len(offsets)): @@ -329,13 +316,10 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs): token_head_index = index if token_head_index == -1: raise ValueError(Errors.E113) - # First, make the dependencies absolutes, and adjust all possible dependencies before # creating the tokens - for i in range(doc.length): doc.c[i].head += i - # Adjust dependencies offset = nb_subtokens - 1 for i in range(doc.length): @@ -344,22 +328,17 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs): doc.c[i].head = token_head_index elif head_idx > token_index: doc.c[i].head += offset - new_token_head = doc.c[token_index].head - # Double doc.c max_length if necessary (until big enough for all new tokens) while doc.length + nb_subtokens - 1 >= doc.max_length: doc._realloc(doc.length * 2) - # Move tokens after the split to create space for the new tokens doc.length = len(doc) + nb_subtokens -1 for token_to_move in range(doc.length - 1, token_index, -1): doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move] - # Host the tokens in the newly created space cdef int idx_offset = 0 for i, orth in enumerate(orths): - token = &doc.c[token_index + i] lex = doc.vocab.get(doc.mem, orth) token.lex = lex @@ -367,21 +346,18 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs): if i != 0: token.idx = orig_token.idx + idx_offset idx_offset += len(orth) - # Set token.spacy to False for all non-last split tokens, and # to origToken.spacy for the last token if (i < nb_subtokens - 1): token.spacy = False else: token.spacy = orig_token.spacy - # Apply attrs to each subtoken for attr_name, attr_value in attrs.items(): if attr_name == TAG: doc.vocab.morphology.assign_tag(token, attr_value) else: Token.set_struct_attr(token, attr_name, attr_value) - # Make IOB consistent if (orig_token.ent_iob == 3): if i == 0: @@ -391,22 +367,17 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs): else: # In all other cases subtokens inherit iob from origToken token.ent_iob = orig_token.ent_iob - # Use the head of the new token everywhere. This will be partially overwritten later on. 
token.head = new_token_head - # Transform the dependencies into relative ones again for i in range(doc.length): doc.c[i].head -= i - # Assign correct dependencies to the inner token for i, head in enumerate(heads): if head != 0: # the token's head's head is already correct doc.c[token_index + i].head = head - for i, dep in enumerate(deps): doc[token_index + i].dep = dep - # set children from head set_children_from_heads(doc.c, doc.length) From 3d577b77c660f38521daf13e84c5107e169cd6d3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 14 Feb 2019 19:56:38 +0100 Subject: [PATCH 2/3] Auto-formatting --- spacy/tests/doc/test_doc_spilt.py | 60 +++++++++++++++++-------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/spacy/tests/doc/test_doc_spilt.py b/spacy/tests/doc/test_doc_spilt.py index 827fd565e..bcadf4eb4 100644 --- a/spacy/tests/doc/test_doc_spilt.py +++ b/spacy/tests/doc/test_doc_spilt.py @@ -1,12 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from ..util import get_doc -from ...vocab import Vocab -from ...tokens import Doc -from ...tokens import Span - import pytest +from spacy.vocab import Vocab +from spacy.tokens import Doc + +from ..util import get_doc def test_doc_split(en_tokenizer): @@ -17,35 +16,41 @@ def test_doc_split(en_tokenizer): assert len(doc) == 3 assert len(str(doc)) == 19 - assert doc[0].head.text == 'start' - assert doc[1].head.text == '.' + assert doc[0].head.text == "start" + assert doc[1].head.text == "." with doc.retokenize() as retokenizer: - retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'}) + retokenizer.split( + doc[0], + ["Los", "Angeles"], + [1, 0], + attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"}, + ) assert len(doc) == 4 - assert doc[0].text == 'Los' - assert doc[0].head.text == 'Angeles' + assert doc[0].text == "Los" + assert doc[0].head.text == "Angeles" assert doc[0].idx == 0 assert doc[1].idx == 3 - assert doc[1].text == 'Angeles' - assert doc[1].head.text == 'start' + assert doc[1].text == "Angeles" + assert doc[1].head.text == "start" - assert doc[2].text == 'start' - assert doc[2].head.text == '.' + assert doc[2].text == "start" + assert doc[2].head.text == "." - assert doc[3].text == '.' - assert doc[3].head.text == '.' + assert doc[3].text == "." + assert doc[3].head.text == "." assert len(str(doc)) == 19 + def test_split_dependencies(en_tokenizer): text = "LosAngeles start." tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, [t.text for t in tokens]) - dep1 = doc.vocab.strings.add('amod') - dep2 = doc.vocab.strings.add('subject') + dep1 = doc.vocab.strings.add("amod") + dep2 = doc.vocab.strings.add("subject") with doc.retokenize() as retokenizer: retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2]) @@ -53,27 +58,26 @@ def test_split_dependencies(en_tokenizer): assert doc[1].dep == dep2 - def test_split_heads_error(en_tokenizer): text = "LosAngeles start." 
tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, [t.text for t in tokens]) - #Not enough heads + # Not enough heads with pytest.raises(ValueError): with doc.retokenize() as retokenizer: retokenizer.split(doc[0], ["Los", "Angeles"], [0]) - #Too many heads + # Too many heads with pytest.raises(ValueError): with doc.retokenize() as retokenizer: retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0]) - #No token head + # No token head with pytest.raises(ValueError): with doc.retokenize() as retokenizer: retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1]) - #Several token heads + # Several token heads with pytest.raises(ValueError): with doc.retokenize() as retokenizer: retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0]) @@ -83,7 +87,7 @@ def test_spans_entity_merge_iob(): # Test entity IOB stays consistent after merging words = ["abc", "d", "e"] doc = Doc(Vocab(), words=words) - doc.ents = [(doc.vocab.strings.add('ent-abcd'), 0, 2)] + doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)] assert doc[0].ent_iob_ == "B" assert doc[1].ent_iob_ == "I" @@ -94,12 +98,14 @@ def test_spans_entity_merge_iob(): assert doc[2].ent_iob_ == "I" assert doc[3].ent_iob_ == "I" + def test_spans_sentence_update_after_merge(en_tokenizer): + # fmt: off text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale." heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2] - deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr', - 'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj', - 'compound', 'punct'] + deps = ["nsubj", "ROOT", "det", "amod", "prt", "attr", "punct", "nsubj", + "ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"] + # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps) From f1461210922e55f2419ab36b5e7cac051c346cf1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 14 Feb 2019 20:03:19 +0100 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=92=AB=20=20Make=20handling=20of=20[P?= =?UTF-8?q?ipe].labels=20consistent=20=20(#3273)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Make handling of [Pipe].labels consistent * Un-xfail passing test * Update spacy/pipeline/pipes.pyx Co-Authored-By: ines * Update spacy/pipeline/pipes.pyx Co-Authored-By: ines * Update spacy/tests/pipeline/test_pipe_methods.py Co-Authored-By: ines * Update spacy/pipeline/pipes.pyx Co-Authored-By: ines * Move error message to spacy.errors * Fix textcat labels and test * Make EntityRuler.labels return tuple as well --- spacy/errors.py | 4 ++++ spacy/pipeline/entityruler.py | 2 +- spacy/pipeline/pipes.pyx | 23 ++++++++++--------- spacy/tests/pipeline/test_pipe_methods.py | 12 ++++++++++ spacy/tests/regression/test_issue2001-2500.py | 3 +-- 5 files changed, 30 insertions(+), 14 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index f689963dd..37deb4560 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -317,6 +317,10 @@ class Errors(object): E113 = ("The newly split token can only have one root (head = 0).") E114 = ("The newly split token needs to have a root (head = 0)") E115 = ("All subtokens must have associated heads") + E116 = ("Cannot currently add labels to pre-trained text classifier. Add " + "labels before training begins. 
This functionality was available " + "in previous versions, but had significant bugs that led to poor " + "performance") @add_codes diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index bef929411..6b757fe7c 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -86,7 +86,7 @@ class EntityRuler(object): """ all_labels = set(self.token_patterns.keys()) all_labels.update(self.phrase_patterns.keys()) - return all_labels + return tuple(all_labels) @property def patterns(self): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 92fecb8b3..153e053e2 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -358,7 +358,7 @@ class Tagger(Pipe): @property def labels(self): - return self.vocab.morphology.tag_names + return tuple(self.vocab.morphology.tag_names) @property def tok2vec(self): @@ -884,11 +884,11 @@ class TextCategorizer(Pipe): @property def labels(self): - return self.cfg.setdefault('labels', []) + return tuple(self.cfg.setdefault('labels', [])) @labels.setter def labels(self, value): - self.cfg['labels'] = value + self.cfg['labels'] = tuple(value) def __call__(self, doc): scores, tensors = self.predict([doc]) @@ -957,17 +957,13 @@ class TextCategorizer(Pipe): # The problem is that we resize the last layer, but the last layer # is actually just an ensemble. We're not resizing the child layers # -- a huge problem. - raise ValueError( - "Cannot currently add labels to pre-trained text classifier. " - "Add labels before training begins. This functionality was " - "available in previous versions, but had significant bugs that " - "let to poor performance") + raise ValueError(Errors.E116) #smaller = self.model._layers[-1] #larger = Affine(len(self.labels)+1, smaller.nI) #copy_array(larger.W[:smaller.nO], smaller.W) #copy_array(larger.b[:smaller.nO], smaller.b) #self.model._layers[-1] = larger - self.labels.append(label) + self.labels = tuple(list(self.labels) + [label]) return 1 def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, @@ -1012,6 +1008,11 @@ cdef class DependencyParser(Parser): return (DependencyParser, (self.vocab, self.moves, self.model), None, None) + @property + def labels(self): + # Get the labels from the model by looking at the available moves + return tuple(set(move.split("-")[1] for move in self.move_names)) + cdef class EntityRecognizer(Parser): name = "ner" @@ -1040,8 +1041,8 @@ cdef class EntityRecognizer(Parser): def labels(self): # Get the labels from the model by looking at the available moves, e.g. 
# B-PERSON, I-PERSON, L-PERSON, U-PERSON - return [move.split("-")[1] for move in self.move_names - if move[0] in ("B", "I", "L", "U")] + return tuple(set(move.split("-")[1] for move in self.move_names + if move[0] in ("B", "I", "L", "U"))) __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer'] diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index bd779d5c2..d36201718 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -112,3 +112,15 @@ def test_add_lots_of_pipes(nlp, n_pipes): def test_raise_for_invalid_components(nlp, component): with pytest.raises(ValueError): nlp.add_pipe(component) + + +@pytest.mark.parametrize("component", ["ner", "tagger", "parser", "textcat"]) +def test_pipe_base_class_add_label(nlp, component): + label = "TEST" + pipe = nlp.create_pipe(component) + pipe.add_label(label) + if component == "tagger": + # Tagger always has the default coarse-grained label scheme + assert label in pipe.labels + else: + assert pipe.labels == (label,) diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 70040898f..3922db6d3 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -11,7 +11,6 @@ import numpy from ..util import add_vecs_to_vocab, get_doc -@pytest.mark.xfail def test_issue2179(): """Test that spurious 'extra_labels' aren't created when initializing NER.""" nlp = Italian() @@ -23,7 +22,7 @@ def test_issue2179(): nlp2.add_pipe(nlp2.create_pipe("ner")) nlp2.from_bytes(nlp.to_bytes()) assert "extra_labels" not in nlp2.get_pipe("ner").cfg - assert nlp2.get_pipe("ner").labels == ["CITIZENSHIP"] + assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) def test_issue2219(en_vocab):
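
The patches above touch two user-facing APIs: the retokenizer's split() (patches 1 and 2) and the labels property of the pipeline components (patch 3). The two sketches below are not part of the patches; they are minimal usage examples assuming a spaCy checkout that includes these commits, and the labels they use ("amod", "subject", "MY_LABEL", "MY_TAG") are made up purely for illustration.

First, retokenizer.split() as exercised by the reformatted tests in test_doc_spilt.py: it takes the token to split, the subtoken texts, per-subtoken head offsets and, optionally, a list of dependency label IDs, mirroring test_split_dependencies.

from spacy.lang.en import English

nlp = English()  # blank English pipeline: tokenizer only
doc = nlp("LosAngeles start.")
dep_amod = doc.vocab.strings.add("amod")      # example dependency labels
dep_subj = doc.vocab.strings.add("subject")
with doc.retokenize() as retokenizer:
    # Subtoken heads are relative offsets: "Los" attaches to the token one
    # position to its right ("Angeles"); 0 marks the subtoken that inherits
    # the original token's head and acts as the root of the split.
    retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep_amod, dep_subj])
assert [t.text for t in doc] == ["Los", "Angeles", "start", "."]
assert doc[0].dep == dep_amod
assert doc[1].dep == dep_subj

Second, patch 3 makes every component's labels property return a tuple and moves the "can't add labels to a pre-trained text classifier" message into spacy.errors as E116. A sketch of the resulting behaviour, mirroring the new test_pipe_base_class_add_label test:

from spacy.lang.en import English

nlp = English()
for name in ("ner", "parser", "textcat"):
    pipe = nlp.create_pipe(name)
    pipe.add_label("MY_LABEL")
    # labels is now consistently a tuple, whatever the component
    assert pipe.labels == ("MY_LABEL",)

tagger = nlp.create_pipe("tagger")
tagger.add_label("MY_TAG")
# The tagger keeps the default coarse-grained tag scheme alongside new tags,
# so we only check membership rather than the full tuple.
assert "MY_TAG" in tagger.labels
assert isinstance(tagger.labels, tuple)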