Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 09:14:32 +03:00)

Commit 0371ac23e7: Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -317,6 +317,10 @@ class Errors(object):
     E113 = ("The newly split token can only have one root (head = 0).")
     E114 = ("The newly split token needs to have a root (head = 0)")
     E115 = ("All subtokens must have associated heads")
+    E116 = ("Cannot currently add labels to pre-trained text classifier. Add "
+            "labels before training begins. This functionality was available "
+            "in previous versions, but had significant bugs that led to poor "
+            "performance")
 
 
 @add_codes
@@ -86,7 +86,7 @@ class EntityRuler(object):
         """
         all_labels = set(self.token_patterns.keys())
         all_labels.update(self.phrase_patterns.keys())
-        return all_labels
+        return tuple(all_labels)
 
     @property
     def patterns(self):
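With this change, `EntityRuler.labels` hands back an immutable tuple instead of the mutable set it builds internally, matching the other pipeline components below. An illustrative sketch, not part of the diff (the patterns are made up, and the tuple's order is not guaranteed because it comes from a set):

from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(nlp)
# Hypothetical patterns, just to give the ruler some labels
ruler.add_patterns([
    {"label": "ORG", "pattern": "Explosion AI"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},
])
assert set(ruler.labels) == {"ORG", "GPE"}   # a tuple now, so no in-place mutation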
@@ -358,7 +358,7 @@ class Tagger(Pipe):
     @property
     def labels(self):
-        return self.vocab.morphology.tag_names
+        return tuple(self.vocab.morphology.tag_names)
 
     @property
     def tok2vec(self):
@@ -884,11 +884,11 @@ class TextCategorizer(Pipe):
 
     @property
     def labels(self):
-        return self.cfg.setdefault('labels', [])
+        return tuple(self.cfg.setdefault('labels', []))
 
     @labels.setter
     def labels(self, value):
-        self.cfg['labels'] = value
+        self.cfg['labels'] = tuple(value)
 
     def __call__(self, doc):
         scores, tensors = self.predict([doc])
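The getter now wraps `cfg['labels']` in a tuple and the setter coerces whatever it is given to a tuple, so callers (and the component itself) can no longer mutate the label list in place. A rough usage sketch assuming a freshly created component, not taken from the diff:

from spacy.lang.en import English

nlp = English()
textcat = nlp.create_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
assert textcat.labels == ("POSITIVE", "NEGATIVE")
textcat.labels = ["A", "B"]          # the setter stores tuple(value)
assert textcat.labels == ("A", "B")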
@@ -957,17 +957,13 @@ class TextCategorizer(Pipe):
             # The problem is that we resize the last layer, but the last layer
             # is actually just an ensemble. We're not resizing the child layers
             # -- a huge problem.
-            raise ValueError(
-                "Cannot currently add labels to pre-trained text classifier. "
-                "Add labels before training begins. This functionality was "
-                "available in previous versions, but had significant bugs that "
-                "let to poor performance")
+            raise ValueError(Errors.E116)
             #smaller = self.model._layers[-1]
             #larger = Affine(len(self.labels)+1, smaller.nI)
             #copy_array(larger.W[:smaller.nO], smaller.W)
             #copy_array(larger.b[:smaller.nO], smaller.b)
             #self.model._layers[-1] = larger
-        self.labels.append(label)
+        self.labels = tuple(list(self.labels) + [label])
         return 1
 
     def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
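Because `labels` is now a tuple, `add_label` can no longer call `.append()`; it rebuilds the tuple and assigns it back through the setter, and the pre-trained-model case now raises the shared `Errors.E116` message instead of a hard-coded string. A minimal sketch of the rebuild idiom (values are illustrative):

labels = ("POSITIVE",)                    # stands in for self.labels
label = "NEGATIVE"
labels = tuple(list(labels) + [label])    # replaces labels.append(label)
assert labels == ("POSITIVE", "NEGATIVE")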
@@ -1012,6 +1008,11 @@ cdef class DependencyParser(Parser):
         return (DependencyParser, (self.vocab, self.moves, self.model),
                 None, None)
 
+    @property
+    def labels(self):
+        # Get the labels from the model by looking at the available moves
+        return tuple(set(move.split("-")[1] for move in self.move_names))
+
 
 cdef class EntityRecognizer(Parser):
     name = "ner"
@@ -1040,8 +1041,8 @@ cdef class EntityRecognizer(Parser):
     def labels(self):
         # Get the labels from the model by looking at the available moves, e.g.
         # B-PERSON, I-PERSON, L-PERSON, U-PERSON
-        return [move.split("-")[1] for move in self.move_names
-                if move[0] in ("B", "I", "L", "U")]
+        return tuple(set(move.split("-")[1] for move in self.move_names
+                         if move[0] in ("B", "I", "L", "U")))
 
 
 __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer']
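Both parser-based components now derive their `labels` from the transition names in `move_names`, deduplicating with a set before freezing the result into a tuple. A standalone sketch of that derivation with invented move names:

# Invented move names in the BILUO scheme used by the entity recognizer
move_names = ["B-PERSON", "I-PERSON", "L-PERSON", "U-GPE", "O", "B-ORG"]
labels = tuple(set(move.split("-")[1] for move in move_names
                   if move[0] in ("B", "I", "L", "U")))
assert sorted(labels) == ["GPE", "ORG", "PERSON"]   # one entry per entity type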
@@ -1,12 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ..util import get_doc
-from ...vocab import Vocab
-from ...tokens import Doc
-from ...tokens import Span
-
 import pytest
+from spacy.vocab import Vocab
+from spacy.tokens import Doc
+
+from ..util import get_doc
 
 
 def test_doc_split(en_tokenizer):
@@ -17,35 +16,41 @@ def test_doc_split(en_tokenizer):
 
     assert len(doc) == 3
     assert len(str(doc)) == 19
-    assert doc[0].head.text == 'start'
-    assert doc[1].head.text == '.'
+    assert doc[0].head.text == "start"
+    assert doc[1].head.text == "."
 
     with doc.retokenize() as retokenizer:
-        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'})
+        retokenizer.split(
+            doc[0],
+            ["Los", "Angeles"],
+            [1, 0],
+            attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
+        )
 
     assert len(doc) == 4
-    assert doc[0].text == 'Los'
-    assert doc[0].head.text == 'Angeles'
+    assert doc[0].text == "Los"
+    assert doc[0].head.text == "Angeles"
     assert doc[0].idx == 0
     assert doc[1].idx == 3
 
-    assert doc[1].text == 'Angeles'
-    assert doc[1].head.text == 'start'
+    assert doc[1].text == "Angeles"
+    assert doc[1].head.text == "start"
 
-    assert doc[2].text == 'start'
-    assert doc[2].head.text == '.'
+    assert doc[2].text == "start"
+    assert doc[2].head.text == "."
 
-    assert doc[3].text == '.'
-    assert doc[3].head.text == '.'
+    assert doc[3].text == "."
+    assert doc[3].head.text == "."
 
     assert len(str(doc)) == 19
 
 
 def test_split_dependencies(en_tokenizer):
     text = "LosAngeles start."
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, [t.text for t in tokens])
-    dep1 = doc.vocab.strings.add('amod')
-    dep2 = doc.vocab.strings.add('subject')
+    dep1 = doc.vocab.strings.add("amod")
+    dep2 = doc.vocab.strings.add("subject")
     with doc.retokenize() as retokenizer:
         retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2])
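The head values passed to `retokenizer.split` in these tests are relative offsets, and exactly one subtoken must be given head 0: it becomes the root of the split and takes over the original token's head, which is what E113-E115 and `_split` further down enforce. A self-contained sketch, assuming the experimental `split` signature used in this branch (the API changed in later releases):

from spacy.lang.en import English

nlp = English()
doc = nlp("LosAngeles start.")
with doc.retokenize() as retokenizer:
    # "Los" points one position to the right ("Angeles"); "Angeles" has
    # head 0, so it is the root of the split and inherits the old head.
    retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0])
assert [t.text for t in doc] == ["Los", "Angeles", "start", "."]
assert doc[0].head.text == "Angeles"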
@@ -53,27 +58,26 @@ def test_split_dependencies(en_tokenizer):
     assert doc[1].dep == dep2
 
 
 
 def test_split_heads_error(en_tokenizer):
     text = "LosAngeles start."
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, [t.text for t in tokens])
-    #Not enough heads
+    # Not enough heads
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.split(doc[0], ["Los", "Angeles"], [0])
 
-    #Too many heads
+    # Too many heads
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0])
 
-    #No token head
+    # No token head
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1])
 
-    #Several token heads
+    # Several token heads
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0])
@@ -83,7 +87,7 @@ def test_spans_entity_merge_iob():
     # Test entity IOB stays consistent after merging
     words = ["abc", "d", "e"]
     doc = Doc(Vocab(), words=words)
-    doc.ents = [(doc.vocab.strings.add('ent-abcd'), 0, 2)]
+    doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)]
     assert doc[0].ent_iob_ == "B"
     assert doc[1].ent_iob_ == "I"
 
@@ -94,12 +98,14 @@ def test_spans_entity_merge_iob():
     assert doc[2].ent_iob_ == "I"
     assert doc[3].ent_iob_ == "I"
 
 
 def test_spans_sentence_update_after_merge(en_tokenizer):
+    # fmt: off
     text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale."
     heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
-    deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
-            'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
-            'compound', 'punct']
+    deps = ["nsubj", "ROOT", "det", "amod", "prt", "attr", "punct", "nsubj",
+            "ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"]
+    # fmt: on
 
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
@@ -112,3 +112,15 @@ def test_add_lots_of_pipes(nlp, n_pipes):
 def test_raise_for_invalid_components(nlp, component):
     with pytest.raises(ValueError):
         nlp.add_pipe(component)
+
+
+@pytest.mark.parametrize("component", ["ner", "tagger", "parser", "textcat"])
+def test_pipe_base_class_add_label(nlp, component):
+    label = "TEST"
+    pipe = nlp.create_pipe(component)
+    pipe.add_label(label)
+    if component == "tagger":
+        # Tagger always has the default coarse-grained label scheme
+        assert label in pipe.labels
+    else:
+        assert pipe.labels == (label,)
@@ -11,7 +11,6 @@ import numpy
 from ..util import add_vecs_to_vocab, get_doc
 
 
-@pytest.mark.xfail
 def test_issue2179():
     """Test that spurious 'extra_labels' aren't created when initializing NER."""
     nlp = Italian()
@@ -23,7 +22,7 @@ def test_issue2179():
     nlp2.add_pipe(nlp2.create_pipe("ner"))
     nlp2.from_bytes(nlp.to_bytes())
     assert "extra_labels" not in nlp2.get_pipe("ner").cfg
-    assert nlp2.get_pipe("ner").labels == ["CITIZENSHIP"]
+    assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
 
 
 def test_issue2219(en_vocab):
@@ -21,6 +21,7 @@ from ..attrs import intify_attrs
 from ..util import SimpleFrozenDict
 from ..errors import Errors
 
 
 cdef class Retokenizer:
     """Helper class for doc.retokenize() context manager."""
     cdef Doc doc
@@ -174,25 +175,21 @@ def _bulk_merge(Doc doc, merges):
 
     def _get_start(merge):
         return merge[0].start
-    merges.sort(key=_get_start)
 
+    merges.sort(key=_get_start)
     for merge_index, (span, attributes) in enumerate(merges):
         start = span.start
         end = span.end
         spans.append(span)
 
         # House the new merged token where it starts
         token = &doc.c[start]
 
         tokens[merge_index] = token
 
         # Assign attributes
         for attr_name, attr_value in attributes.items():
             if attr_name == TAG:
                 doc.vocab.morphology.assign_tag(token, attr_value)
             else:
                 Token.set_struct_attr(token, attr_name, attr_value)
 
     # Resize the doc.tensor, if it's set. Let the last row for each token stand
     # for the merged region. To do this, we create a boolean array indicating
     # whether the row is to be deleted, then use numpy.delete
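The comment above describes how `doc.tensor` is shrunk after a bulk merge: keep one row per merged region, mark the other rows in a boolean array, and drop them with `numpy.delete`. A tiny standalone illustration of that masking step (the tensor and mask are made up):

import numpy

# One row per token; tokens 1 and 2 are merged, so row 2 is dropped and
# row 1 is left to stand for the merged region.
tensor = numpy.arange(12).reshape(4, 3)
delete_rows = numpy.array([False, False, True, False])
tensor = numpy.delete(tensor, numpy.where(delete_rows)[0], axis=0)
assert tensor.shape == (3, 3)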
@@ -205,7 +202,6 @@ def _bulk_merge(Doc doc, merges):
     for i, span in enumerate(spans):
         span_roots.append(span.root.i)
         tokens[i].dep = span.root.dep
 
     # We update token.lex after keeping span root and dep, since
     # setting token.lex will change span.start and span.end properties
     # as it modifies the character offsets in the doc
@@ -217,7 +213,6 @@ def _bulk_merge(Doc doc, merges):
         tokens[token_index].lex = lex
         # We set trailing space here too
         tokens[token_index].spacy = doc.c[spans[token_index].end-1].spacy
 
     # Begin by setting all the head indices to absolute token positions
     # This is easier to work with for now than the offsets
     # Before thinking of something simpler, beware the case where a
@@ -225,11 +220,9 @@ def _bulk_merge(Doc doc, merges):
     # tokens changes.
     for i in range(doc.length):
         doc.c[i].head += i
 
     # Set the head of the merged token from the Span
     for i in range(len(merges)):
         tokens[i].head = doc.c[span_roots[i]].head
 
     # Adjust deps before shrinking tokens
     # Tokens which point into the merged token should now point to it
     # Subtract the offset from all tokens which point to >= end
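In the `TokenC` array a token's `head` is stored relative to its own position, so the loop above adds `i` to get absolute indices before tokens are moved or deleted, and a mirror loop near the end of `_bulk_merge` subtracts `i` again. The same round trip in plain Python, with invented head values:

rel_heads = [1, 0, -1, -2]                            # 0 marks a root
abs_heads = [h + i for i, h in enumerate(rel_heads)]  # head += i
assert abs_heads == [1, 1, 1, 1]                      # everything points at token 1
# ...tokens get moved/compressed here...
back = [h - i for i, h in enumerate(abs_heads)]       # head -= i
assert back == rel_heads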
@@ -241,16 +234,13 @@ def _bulk_merge(Doc doc, merges):
             #last token was the last of the span
             current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1
             current_span_index += 1
 
         if current_span_index < len(spans) and \
                 spans[current_span_index].start <= i < spans[current_span_index].end:
             offsets.append(spans[current_span_index].start - current_offset)
         else:
             offsets.append(i - current_offset)
 
     for i in range(doc.length):
         doc.c[i].head = offsets[doc.c[i].head]
 
     # Now compress the token array
     offset = 0
     in_span = False
@@ -272,14 +262,11 @@ def _bulk_merge(Doc doc, merges):
             memset(&doc.c[i], 0, sizeof(TokenC))
             doc.c[i].lex = &EMPTY_LEXEME
     doc.length -= offset
 
     # ...And, set heads back to a relative position
     for i in range(doc.length):
         doc.c[i].head -= i
 
     # Set the left/right children, left/right edges
     set_children_from_heads(doc.c, doc.length)
 
     # Make sure ent_iob remains consistent
     for (span, _) in merges:
         if(span.end < len(offsets)):
@@ -329,13 +316,10 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
             token_head_index = index
     if token_head_index == -1:
         raise ValueError(Errors.E113)
 
     # First, make the dependencies absolutes, and adjust all possible dependencies before
     # creating the tokens
 
     for i in range(doc.length):
         doc.c[i].head += i
 
     # Adjust dependencies
     offset = nb_subtokens - 1
     for i in range(doc.length):
@@ -344,22 +328,17 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
             doc.c[i].head = token_head_index
         elif head_idx > token_index:
             doc.c[i].head += offset
 
     new_token_head = doc.c[token_index].head
 
     # Double doc.c max_length if necessary (until big enough for all new tokens)
     while doc.length + nb_subtokens - 1 >= doc.max_length:
         doc._realloc(doc.length * 2)
 
     # Move tokens after the split to create space for the new tokens
     doc.length = len(doc) + nb_subtokens -1
     for token_to_move in range(doc.length - 1, token_index, -1):
         doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
 
     # Host the tokens in the newly created space
     cdef int idx_offset = 0
     for i, orth in enumerate(orths):
 
         token = &doc.c[token_index + i]
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
@@ -367,21 +346,18 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
         if i != 0:
             token.idx = orig_token.idx + idx_offset
         idx_offset += len(orth)
 
         # Set token.spacy to False for all non-last split tokens, and
         # to origToken.spacy for the last token
         if (i < nb_subtokens - 1):
             token.spacy = False
         else:
             token.spacy = orig_token.spacy
 
         # Apply attrs to each subtoken
         for attr_name, attr_value in attrs.items():
             if attr_name == TAG:
                 doc.vocab.morphology.assign_tag(token, attr_value)
             else:
                 Token.set_struct_attr(token, attr_name, attr_value)
 
         # Make IOB consistent
         if (orig_token.ent_iob == 3):
             if i == 0:
@@ -391,22 +367,17 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
         else:
             # In all other cases subtokens inherit iob from origToken
             token.ent_iob = orig_token.ent_iob
 
         # Use the head of the new token everywhere. This will be partially overwritten later on.
         token.head = new_token_head
 
     # Transform the dependencies into relative ones again
     for i in range(doc.length):
         doc.c[i].head -= i
 
     # Assign correct dependencies to the inner token
     for i, head in enumerate(heads):
         if head != 0:
             # the token's head's head is already correct
             doc.c[token_index + i].head = head
 
     for i, dep in enumerate(deps):
         doc[token_index + i].dep = dep
 
     # set children from head
     set_children_from_heads(doc.c, doc.length)
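For reading the `ent_iob` branch above: spaCy stores IOB as an integer, where 3 is "B". When the original token began an entity, only the first subtoken keeps "B"; the branch for the remaining subtokens falls outside this hunk, but presumably continues the entity, while every other IOB value is simply inherited by all subtokens. A reference mapping (from spaCy's token API, not from this diff):

# Integer ent_iob codes as exposed on spaCy tokens
IOB_CODES = {0: "", 1: "I", 2: "O", 3: "B"}
assert IOB_CODES[3] == "B"   # the case handled explicitly in _split above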