Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)

Commit 0371ac23e7: Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -317,6 +317,10 @@ class Errors(object):
     E113 = ("The newly split token can only have one root (head = 0).")
     E114 = ("The newly split token needs to have a root (head = 0)")
     E115 = ("All subtokens must have associated heads")
+    E116 = ("Cannot currently add labels to pre-trained text classifier. Add "
+            "labels before training begins. This functionality was available "
+            "in previous versions, but had significant bugs that led to poor "
+            "performance")
 
 
 @add_codes
@@ -86,7 +86,7 @@ class EntityRuler(object):
         """
         all_labels = set(self.token_patterns.keys())
         all_labels.update(self.phrase_patterns.keys())
-        return all_labels
+        return tuple(all_labels)
 
     @property
     def patterns(self):
@@ -358,7 +358,7 @@ class Tagger(Pipe):
 
     @property
     def labels(self):
-        return self.vocab.morphology.tag_names
+        return tuple(self.vocab.morphology.tag_names)
 
     @property
     def tok2vec(self):
@@ -884,11 +884,11 @@ class TextCategorizer(Pipe):
 
     @property
     def labels(self):
-        return self.cfg.setdefault('labels', [])
+        return tuple(self.cfg.setdefault('labels', []))
 
     @labels.setter
     def labels(self, value):
-        self.cfg['labels'] = value
+        self.cfg['labels'] = tuple(value)
 
     def __call__(self, doc):
         scores, tensors = self.predict([doc])
@@ -957,17 +957,13 @@ class TextCategorizer(Pipe):
             # The problem is that we resize the last layer, but the last layer
             # is actually just an ensemble. We're not resizing the child layers
             # -- a huge problem.
-            raise ValueError(
-                "Cannot currently add labels to pre-trained text classifier. "
-                "Add labels before training begins. This functionality was "
-                "available in previous versions, but had significant bugs that "
-                "let to poor performance")
+            raise ValueError(Errors.E116)
             #smaller = self.model._layers[-1]
             #larger = Affine(len(self.labels)+1, smaller.nI)
             #copy_array(larger.W[:smaller.nO], smaller.W)
             #copy_array(larger.b[:smaller.nO], smaller.b)
             #self.model._layers[-1] = larger
-        self.labels.append(label)
+        self.labels = tuple(list(self.labels) + [label])
         return 1
 
     def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
@@ -1012,6 +1008,11 @@ cdef class DependencyParser(Parser):
         return (DependencyParser, (self.vocab, self.moves, self.model),
                 None, None)
 
+    @property
+    def labels(self):
+        # Get the labels from the model by looking at the available moves
+        return tuple(set(move.split("-")[1] for move in self.move_names))
+
 
 cdef class EntityRecognizer(Parser):
     name = "ner"
@@ -1040,8 +1041,8 @@ cdef class EntityRecognizer(Parser):
     def labels(self):
         # Get the labels from the model by looking at the available moves, e.g.
         # B-PERSON, I-PERSON, L-PERSON, U-PERSON
-        return [move.split("-")[1] for move in self.move_names
-                if move[0] in ("B", "I", "L", "U")]
+        return tuple(set(move.split("-")[1] for move in self.move_names
+                if move[0] in ("B", "I", "L", "U")))
 
 
 __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer']
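Note: as a rough illustration of the move-name parsing above, the transition-system action names are split on "-" and deduplicated, so each entity type appears once in the resulting tuple. The move names below are made up for the example; real ones come from the parser's moves:

    move_names = ["B-PERSON", "I-PERSON", "L-PERSON", "U-GPE", "O"]
    labels = tuple(set(m.split("-")[1] for m in move_names if m[0] in ("B", "I", "L", "U")))
    assert set(labels) == {"PERSON", "GPE"}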
@@ -1,12 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ..util import get_doc
-from ...vocab import Vocab
-from ...tokens import Doc
-from ...tokens import Span
-
 import pytest
+from spacy.vocab import Vocab
+from spacy.tokens import Doc
+
+from ..util import get_doc
 
 
 def test_doc_split(en_tokenizer):
@@ -17,35 +16,41 @@ def test_doc_split(en_tokenizer):
 
     assert len(doc) == 3
     assert len(str(doc)) == 19
-    assert doc[0].head.text == 'start'
-    assert doc[1].head.text == '.'
+    assert doc[0].head.text == "start"
+    assert doc[1].head.text == "."
 
     with doc.retokenize() as retokenizer:
-        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'})
+        retokenizer.split(
+            doc[0],
+            ["Los", "Angeles"],
+            [1, 0],
+            attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
+        )
+
     assert len(doc) == 4
-    assert doc[0].text == 'Los'
-    assert doc[0].head.text == 'Angeles'
+    assert doc[0].text == "Los"
+    assert doc[0].head.text == "Angeles"
     assert doc[0].idx == 0
     assert doc[1].idx == 3
 
-    assert doc[1].text == 'Angeles'
-    assert doc[1].head.text == 'start'
+    assert doc[1].text == "Angeles"
+    assert doc[1].head.text == "start"
 
-    assert doc[2].text == 'start'
-    assert doc[2].head.text == '.'
+    assert doc[2].text == "start"
+    assert doc[2].head.text == "."
 
-    assert doc[3].text == '.'
-    assert doc[3].head.text == '.'
+    assert doc[3].text == "."
+    assert doc[3].head.text == "."
 
     assert len(str(doc)) == 19
 
 
 def test_split_dependencies(en_tokenizer):
     text = "LosAngeles start."
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, [t.text for t in tokens])
-    dep1 = doc.vocab.strings.add('amod')
-    dep2 = doc.vocab.strings.add('subject')
+    dep1 = doc.vocab.strings.add("amod")
+    dep2 = doc.vocab.strings.add("subject")
     with doc.retokenize() as retokenizer:
         retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2])
 
@@ -53,27 +58,26 @@ def test_split_dependencies(en_tokenizer):
     assert doc[1].dep == dep2
 
 
-
 def test_split_heads_error(en_tokenizer):
     text = "LosAngeles start."
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, [t.text for t in tokens])
-    #Not enough heads
+    # Not enough heads
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.split(doc[0], ["Los", "Angeles"], [0])
 
-    #Too many heads
+    # Too many heads
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0])
 
-    #No token head
+    # No token head
    with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1])
 
-    #Several token heads
+    # Several token heads
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0])
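Note: a hedged reading of the heads argument these tests exercise, inferred from the E113-E115 error messages above: heads has one entry per new subtoken, exactly one entry must be 0 to mark the subtoken that becomes the root of the split (it keeps the original token's head), and the remaining entries appear to be relative offsets to each subtoken's new head. As a plain-Python sanity check on the valid case from the test:

    heads = [1, 0]              # one entry per subtoken ("Los", "Angeles")
    assert len(heads) == 2      # fewer or extra entries trip the not-enough / too-many checks
    assert heads.count(0) == 1  # no 0 -> E114, more than one 0 -> E113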
@@ -83,7 +87,7 @@ def test_spans_entity_merge_iob():
     # Test entity IOB stays consistent after merging
     words = ["abc", "d", "e"]
     doc = Doc(Vocab(), words=words)
-    doc.ents = [(doc.vocab.strings.add('ent-abcd'), 0, 2)]
+    doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)]
     assert doc[0].ent_iob_ == "B"
     assert doc[1].ent_iob_ == "I"
 
@@ -94,12 +98,14 @@ def test_spans_entity_merge_iob():
     assert doc[2].ent_iob_ == "I"
     assert doc[3].ent_iob_ == "I"
 
+
 def test_spans_sentence_update_after_merge(en_tokenizer):
+    # fmt: off
     text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale."
     heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
-    deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
-            'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
-            'compound', 'punct']
+    deps = ["nsubj", "ROOT", "det", "amod", "prt", "attr", "punct", "nsubj",
+            "ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"]
+    # fmt: on
 
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
@@ -112,3 +112,15 @@ def test_add_lots_of_pipes(nlp, n_pipes):
 def test_raise_for_invalid_components(nlp, component):
     with pytest.raises(ValueError):
         nlp.add_pipe(component)
+
+
+@pytest.mark.parametrize("component", ["ner", "tagger", "parser", "textcat"])
+def test_pipe_base_class_add_label(nlp, component):
+    label = "TEST"
+    pipe = nlp.create_pipe(component)
+    pipe.add_label(label)
+    if component == "tagger":
+        # Tagger always has the default coarse-grained label scheme
+        assert label in pipe.labels
+    else:
+        assert pipe.labels == (label,)
@@ -11,7 +11,6 @@ import numpy
 from ..util import add_vecs_to_vocab, get_doc
 
 
-@pytest.mark.xfail
 def test_issue2179():
     """Test that spurious 'extra_labels' aren't created when initializing NER."""
     nlp = Italian()
@@ -23,7 +22,7 @@ def test_issue2179():
     nlp2.add_pipe(nlp2.create_pipe("ner"))
     nlp2.from_bytes(nlp.to_bytes())
     assert "extra_labels" not in nlp2.get_pipe("ner").cfg
-    assert nlp2.get_pipe("ner").labels == ["CITIZENSHIP"]
+    assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
 
 
 def test_issue2219(en_vocab):
@@ -21,6 +21,7 @@ from ..attrs import intify_attrs
 from ..util import SimpleFrozenDict
 from ..errors import Errors
 
+
 cdef class Retokenizer:
     """Helper class for doc.retokenize() context manager."""
     cdef Doc doc
@@ -174,25 +175,21 @@ def _bulk_merge(Doc doc, merges):
 
     def _get_start(merge):
         return merge[0].start
-    merges.sort(key=_get_start)
 
+    merges.sort(key=_get_start)
     for merge_index, (span, attributes) in enumerate(merges):
         start = span.start
         end = span.end
         spans.append(span)
-
         # House the new merged token where it starts
         token = &doc.c[start]
-
         tokens[merge_index] = token
-
         # Assign attributes
         for attr_name, attr_value in attributes.items():
             if attr_name == TAG:
                 doc.vocab.morphology.assign_tag(token, attr_value)
             else:
                 Token.set_struct_attr(token, attr_name, attr_value)
-
     # Resize the doc.tensor, if it's set. Let the last row for each token stand
     # for the merged region. To do this, we create a boolean array indicating
     # whether the row is to be deleted, then use numpy.delete
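Note: a small sketch of the tensor-compaction strategy described in the comment above, keeping the last row of each merged region and dropping the rest with numpy.delete (shapes and the deletion mask are made up for illustration):

    import numpy

    tensor = numpy.arange(12).reshape(4, 3)                 # 4 token rows, 3-dim vectors
    delete_rows = numpy.array([False, True, False, False])  # e.g. tokens 1-2 merged, keep row 2
    tensor = numpy.delete(tensor, numpy.where(delete_rows)[0], axis=0)
    assert tensor.shape == (3, 3)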
@@ -205,7 +202,6 @@ def _bulk_merge(Doc doc, merges):
     for i, span in enumerate(spans):
         span_roots.append(span.root.i)
         tokens[i].dep = span.root.dep
-
     # We update token.lex after keeping span root and dep, since
     # setting token.lex will change span.start and span.end properties
     # as it modifies the character offsets in the doc
@@ -217,7 +213,6 @@ def _bulk_merge(Doc doc, merges):
         tokens[token_index].lex = lex
         # We set trailing space here too
         tokens[token_index].spacy = doc.c[spans[token_index].end-1].spacy
-
     # Begin by setting all the head indices to absolute token positions
     # This is easier to work with for now than the offsets
     # Before thinking of something simpler, beware the case where a
@@ -225,11 +220,9 @@ def _bulk_merge(Doc doc, merges):
     # tokens changes.
     for i in range(doc.length):
         doc.c[i].head += i
-
     # Set the head of the merged token from the Span
     for i in range(len(merges)):
         tokens[i].head = doc.c[span_roots[i]].head
-
     # Adjust deps before shrinking tokens
     # Tokens which point into the merged token should now point to it
     # Subtract the offset from all tokens which point to >= end
@@ -241,16 +234,13 @@ def _bulk_merge(Doc doc, merges):
             #last token was the last of the span
             current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1
             current_span_index += 1
-
         if current_span_index < len(spans) and \
                 spans[current_span_index].start <= i < spans[current_span_index].end:
             offsets.append(spans[current_span_index].start - current_offset)
         else:
             offsets.append(i - current_offset)
-
     for i in range(doc.length):
         doc.c[i].head = offsets[doc.c[i].head]
-
     # Now compress the token array
     offset = 0
     in_span = False
@@ -272,14 +262,11 @@ def _bulk_merge(Doc doc, merges):
         memset(&doc.c[i], 0, sizeof(TokenC))
         doc.c[i].lex = &EMPTY_LEXEME
     doc.length -= offset
-
     # ...And, set heads back to a relative position
     for i in range(doc.length):
         doc.c[i].head -= i
-
     # Set the left/right children, left/right edges
     set_children_from_heads(doc.c, doc.length)
-
     # Make sure ent_iob remains consistent
     for (span, _) in merges:
         if(span.end < len(offsets)):
@@ -329,13 +316,10 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
             token_head_index = index
     if token_head_index == -1:
         raise ValueError(Errors.E113)
-
     # First, make the dependencies absolutes, and adjust all possible dependencies before
     # creating the tokens
-
     for i in range(doc.length):
         doc.c[i].head += i
-
     # Adjust dependencies
     offset = nb_subtokens - 1
     for i in range(doc.length):
@@ -344,22 +328,17 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
             doc.c[i].head = token_head_index
         elif head_idx > token_index:
             doc.c[i].head += offset
-
     new_token_head = doc.c[token_index].head
-
     # Double doc.c max_length if necessary (until big enough for all new tokens)
     while doc.length + nb_subtokens - 1 >= doc.max_length:
         doc._realloc(doc.length * 2)
-
     # Move tokens after the split to create space for the new tokens
     doc.length = len(doc) + nb_subtokens -1
     for token_to_move in range(doc.length - 1, token_index, -1):
         doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
-
     # Host the tokens in the newly created space
     cdef int idx_offset = 0
     for i, orth in enumerate(orths):
-
         token = &doc.c[token_index + i]
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
@@ -367,21 +346,18 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
         if i != 0:
             token.idx = orig_token.idx + idx_offset
         idx_offset += len(orth)
-
         # Set token.spacy to False for all non-last split tokens, and
         # to origToken.spacy for the last token
         if (i < nb_subtokens - 1):
             token.spacy = False
         else:
             token.spacy = orig_token.spacy
-
         # Apply attrs to each subtoken
         for attr_name, attr_value in attrs.items():
             if attr_name == TAG:
                 doc.vocab.morphology.assign_tag(token, attr_value)
             else:
                 Token.set_struct_attr(token, attr_name, attr_value)
-
         # Make IOB consistent
         if (orig_token.ent_iob == 3):
             if i == 0:
@@ -391,22 +367,17 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
         else:
             # In all other cases subtokens inherit iob from origToken
             token.ent_iob = orig_token.ent_iob
-
          # Use the head of the new token everywhere. This will be partially overwritten later on.
         token.head = new_token_head
-
     # Transform the dependencies into relative ones again
     for i in range(doc.length):
         doc.c[i].head -= i
-
     # Assign correct dependencies to the inner token
     for i, head in enumerate(heads):
         if head != 0:
             # the token's head's head is already correct
             doc.c[token_index + i].head = head
-
     for i, dep in enumerate(deps):
         doc[token_index + i].dep = dep
-
     # set children from head
     set_children_from_heads(doc.c, doc.length)
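Note: the head bookkeeping above converts relative head offsets to absolute token indices, rewires them, and then converts back. A minimal pure-Python sketch of that round trip (values made up):

    rel_heads = [1, 0, -1]                               # token i attaches to i + rel_heads[i]
    abs_heads = [i + h for i, h in enumerate(rel_heads)]  # -> [1, 1, 1]
    assert [h - i for i, h in enumerate(abs_heads)] == rel_heads  # back to relative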