mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
* Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test
This commit is contained in:
parent
11d6b874db
commit
39815513e2
|
@ -253,6 +253,9 @@ class Errors(object):
|
||||||
E098 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
|
E098 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
|
||||||
" can only be part of one entity, so make sure the entities you're "
|
" can only be part of one entity, so make sure the entities you're "
|
||||||
"setting don't overlap.")
|
"setting don't overlap.")
|
||||||
|
E099 = ("The newly split token can only have one root (head = 0).")
|
||||||
|
E100 = ("The newly split token needs to have a root (head = 0)")
|
||||||
|
E101 = ("All subtokens must have associated heads")
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
class TempErrors(object):
|
class TempErrors(object):
|
||||||
|
|
114
spacy/tests/doc/test_doc_spilt.py
Normal file
114
spacy/tests/doc/test_doc_spilt.py
Normal file
|
@ -0,0 +1,114 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..util import get_doc
|
||||||
|
from ...vocab import Vocab
|
||||||
|
from ...tokens import Doc
|
||||||
|
from ...tokens import Span
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_split(en_tokenizer):
    """Split the first token in two and check texts, heads and offsets."""
    raw_text = "LosAngeles start."
    tokenized = en_tokenizer(raw_text)
    words = [tok.text for tok in tokenized]
    doc = get_doc(tokenized.vocab, words, heads=[1, 1, 0])

    # State before the split.
    assert len(doc) == 3
    assert len(str(doc)) == 19
    assert doc[0].head.text == 'start'
    assert doc[1].head.text == '.'

    subtoken_attrs = {'tag': 'NNP', 'lemma': 'Los Angeles', 'ent_type': 'GPE'}
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0],
                          attrs=subtoken_attrs)

    assert len(doc) == 4
    # Character offsets of the two subtokens within the document text.
    assert [doc[0].idx, doc[1].idx] == [0, 3]
    # (token text, head text) for every token after the split.
    expected = [
        ('Los', 'Angeles'),
        ('Angeles', 'start'),
        ('start', '.'),
        ('.', '.'),
    ]
    observed = [(tok.text, tok.head.text) for tok in doc]
    assert observed == expected
    # The rendered text must be exactly as long as before the split.
    assert len(str(doc)) == 19
|
||||||
|
|
||||||
|
def test_split_dependencies(en_tokenizer):
    """Dependency labels passed to split() land on the subtokens in order."""
    tokenized = en_tokenizer("LosAngeles start.")
    doc = get_doc(tokenized.vocab, [tok.text for tok in tokenized])
    strings = doc.vocab.strings
    # Register the labels first so they can be passed to split() as IDs.
    dep_ids = [strings.add(label) for label in ('amod', 'subject')]
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], dep_ids)

    assert [doc[0].dep, doc[1].dep] == dep_ids
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_heads_error(en_tokenizer):
    """split() raises ValueError for head lists that do not fit the split."""
    tokenized = en_tokenizer("LosAngeles start.")
    doc = get_doc(tokenized.vocab, [tok.text for tok in tokenized])
    invalid_heads = (
        [0],        # not enough heads
        [1, 1, 0],  # too many heads
        [1, 1],     # no token head
        [0, 0],     # several token heads
    )
    # The same doc is reused for every case, one after the other.
    for bad_heads in invalid_heads:
        with pytest.raises(ValueError):
            with doc.retokenize() as retokenizer:
                retokenizer.split(doc[0], ["Los", "Angeles"], bad_heads)
|
||||||
|
|
||||||
|
|
||||||
|
def test_spans_entity_merge_iob():
    """Entity IOB tags stay consistent after splitting an entity's first token."""
    doc = Doc(Vocab(), words=["abc", "d", "e"])
    label = doc.vocab.strings.add('ent-abcd')
    # One entity covering the first two tokens ("abc", "d").
    doc.ents = [(label, 0, 2)]
    assert [doc[0].ent_iob_, doc[1].ent_iob_] == ["B", "I"]

    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["a", "b", "c"], [1, 1, 0])

    # Only the first subtoken begins the entity; the rest, and "d", are inside.
    iob_after = [doc[i].ent_iob_ for i in range(4)]
    assert iob_after == ["B", "I", "I", "I"]
|
||||||
|
|
||||||
|
def test_spans_sentence_update_after_merge(en_tokenizer):
    """Sentence spans grow by one token when a token inside them is split.

    One token is split in each of the two sentences, so each sentence is
    expected to be exactly one token longer afterwards.
    """
    text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale."
    heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
    deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
            'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
            'compound', 'punct']

    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
    sent1, sent2 = list(doc.sents)
    init_len = len(sent1)
    init_len2 = len(sent2)
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Stewart", "Lee"], [1, 0])
        # "JoePasquale" is token 13; token 14 is the sentence-final "."
        # (see the 15-entry heads/deps lists above), so split index 13.
        retokenizer.split(doc[13], ["Joe", "Pasquale"], [1, 0])
    sent1, sent2 = list(doc.sents)
    assert len(sent1) == init_len + 1
    assert len(sent2) == init_len2 + 1
|
|
@ -43,12 +43,12 @@ cdef class Retokenizer:
|
||||||
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
|
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
|
||||||
self.merges.append((span, attrs))
|
self.merges.append((span, attrs))
|
||||||
|
|
||||||
def split(self, Token token, orths, heads, deps=None, attrs=SimpleFrozenDict()):
    """Mark a Token for splitting, into the specified orths.

    The split itself is only recorded here (appended to `self.splits`).

    token (Token): The token to split.
    orths (list): Verbatim texts of the subtokens to create, in order.
    heads (list): One head entry per subtoken. A value of 0 marks the
        subtoken that takes over the original token's head; other values
        are stored as that subtoken's relative head offset.
    deps (list): Optional dependency labels, assigned to the subtokens
        positionally. Defaults to no labels.
    attrs (dict): Attributes that will be applied to each subtoken.
    """
    # Use a fresh list per call rather than a shared mutable default: the
    # list object is kept by reference in self.splits.
    if deps is None:
        deps = []
    attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
    self.splits.append((token.i, orths, heads, deps, attrs))
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
self.merges = []
|
self.merges = []
|
||||||
|
@ -65,8 +65,12 @@ cdef class Retokenizer:
|
||||||
end = span.end
|
end = span.end
|
||||||
_merge(self.doc, start, end, attrs)
|
_merge(self.doc, start, end, attrs)
|
||||||
|
|
||||||
for start_char, orths, attrs in self.splits:
|
offset = 0
|
||||||
raise NotImplementedError
|
# Iterate in order, to keep the offset simple.
|
||||||
|
for token_index, orths, heads, deps, attrs in sorted(self.splits):
|
||||||
|
_split(self.doc, token_index + offset, orths, heads, deps, attrs)
|
||||||
|
# Adjust for the previous tokens
|
||||||
|
offset += len(orths)-1
|
||||||
|
|
||||||
def _merge(Doc doc, int start, int end, attributes):
|
def _merge(Doc doc, int start, int end, attributes):
|
||||||
"""Retokenize the document, such that the span at
|
"""Retokenize the document, such that the span at
|
||||||
|
@ -279,3 +283,111 @@ def _bulk_merge(Doc doc, merges):
|
||||||
|
|
||||||
# Return the merged Python object
|
# Return the merged Python object
|
||||||
return doc[spans[0].start]
|
return doc[spans[0].start]
|
||||||
|
|
||||||
|
|
||||||
|
def _split(Doc doc, int token_index, orths, heads, deps, attrs):
    """Retokenize the document in place, such that the token at
    `doc[token_index]` is split into tokens with the orth 'orths'.

    doc (Doc): The document to modify.
    token_index (int): Index of the token to split.
    orths (list): Verbatim texts of the subtokens to create, in order. Their
        lengths are used to compute the subtokens' character offsets.
    heads (list): One entry per subtoken. Exactly one entry must be 0; that
        subtoken takes over the head of the original token. Every other
        entry is written as the subtoken's relative head offset.
    deps (list): Dependency labels, assigned to the subtokens positionally.
    attrs (dict): Attributes to assign to each of the newly created tokens.
    RAISES (ValueError): If the number of heads differs from the number of
        orths (E101), if more than one head is 0 (E099), or if no head is
        0 (E100).
    """
    cdef int nb_subtokens = len(orths)
    # Remember the pre-split length: doc.length is changed further down, but
    # the shifting of existing tokens must only touch tokens that existed.
    cdef int orig_length = doc.length
    cdef const LexemeC* lex
    cdef TokenC* token
    cdef TokenC orig_token = doc.c[token_index]

    if len(heads) != nb_subtokens:
        raise ValueError(Errors.E101)
    # Locate the single subtoken marked as root (head == 0).
    token_head_index = -1
    for index, head in enumerate(heads):
        if head == 0:
            if token_head_index != -1:
                # More than one root among the subtokens.
                raise ValueError(Errors.E099)
            token_head_index = index
    if token_head_index == -1:
        # No root among the subtokens.
        raise ValueError(Errors.E100)

    # First, make the dependencies absolute, and adjust all possible
    # dependencies before creating the tokens.
    for i in range(doc.length):
        doc.c[i].head += i

    # Adjust dependencies so that they refer to post-split indices.
    offset = nb_subtokens - 1
    for i in range(doc.length):
        head_idx = doc.c[i].head
        if head_idx == token_index:
            # Heads are absolute here, so point at the absolute position of
            # the root subtoken, not at its position within the split.
            doc.c[i].head = token_index + token_head_index
        elif head_idx > token_index:
            doc.c[i].head += offset

    new_token_head = doc.c[token_index].head

    # Double doc.c max_length if necessary (until big enough for all new
    # tokens). Growing from max_length guarantees progress on every pass.
    while orig_length + offset >= doc.max_length:
        doc._realloc(doc.max_length * 2)

    # Move tokens after the split to create space for the new tokens. Only
    # the tokens that existed before the split are shifted, last one first.
    doc.length = orig_length + offset
    for token_to_move in range(orig_length - 1, token_index, -1):
        doc.c[token_to_move + offset] = doc.c[token_to_move]

    # Host the tokens in the newly created space.
    # NOTE(review): slots for i > 0 are not reset before use; only the fields
    # assigned in this loop are overwritten. Confirm whether the remaining
    # fields should be copied from orig_token.
    cdef int idx_offset = 0
    for i, orth in enumerate(orths):
        token = &doc.c[token_index + i]
        lex = doc.vocab.get(doc.mem, orth)
        token.lex = lex
        # Update the character offset of the subtokens. The first subtoken
        # keeps the original token's offset.
        if i != 0:
            token.idx = orig_token.idx + idx_offset
        idx_offset += len(orth)

        # Set token.spacy to False for all non-last split tokens, and
        # to orig_token.spacy for the last token.
        if i < nb_subtokens - 1:
            token.spacy = False
        else:
            token.spacy = orig_token.spacy

        # Apply attrs to each subtoken.
        for attr_name, attr_value in attrs.items():
            if attr_name == TAG:
                doc.vocab.morphology.assign_tag(token, attr_value)
            else:
                Token.set_struct_attr(token, attr_name, attr_value)

        # Make IOB consistent: if the original token had ent_iob 3, only the
        # first subtoken keeps 3 and the others get 1.
        if orig_token.ent_iob == 3:
            if i == 0:
                token.ent_iob = 3
            else:
                token.ent_iob = 1
        else:
            # In all other cases subtokens inherit iob from orig_token.
            token.ent_iob = orig_token.ent_iob

        # Use the head of the new token everywhere. This will be partially
        # overwritten later on.
        token.head = new_token_head

    # Transform the dependencies into relative ones again.
    for i in range(doc.length):
        doc.c[i].head -= i

    # Assign the requested heads to the non-root subtokens.
    for i, head in enumerate(heads):
        if head != 0:
            # The root subtoken's head is already correct.
            doc.c[token_index + i].head = head

    for i, dep in enumerate(deps):
        doc[token_index + i].dep = dep

    # Set children from head.
    set_children_from_heads(doc.c, doc.length)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user