mirror of https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00

Fix dependency copy for as_doc (#3969)

* failing unit test for issue 3962
* attempt to fix Issue #3962
* create artificial unit test example
* using length instead of self.length
* sp
* reformat with black
* find better ancestor within span and use generic 'dep'
* attach to span.root if there is no appropriate ancestor
* comment span text
* clean up ancestor code
* reconstruct dep tree to keep same number of sentences
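
For context, issue #3962 is triggered by calling Span.as_doc() on a span whose tokens have heads outside the span. A minimal reproduction sketch follows; the model name en_core_web_sm and the exact parse it produces are assumptions for illustration, not part of this commit:

# Hypothetical reproduction sketch for issue #3962; assumes the
# en_core_web_sm model is installed and parses the sentence as in the tests.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("He jests at scars, that never felt a wound.")
span = doc[1:5]  # "jests at scars," -- some heads point outside this slice
new_doc = span.as_doc()
# Before this fix, head offsets were copied verbatim, so a head could point
# past the last token of the new doc. With the fix, such tokens re-attach to
# an in-span ancestor or become an artificial root with the generic "dep" label.
for token in new_doc:
    print(token.text, token.dep_, token.head.text)
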
This commit is contained in:
parent a32b033b8c
commit ba02957c80

spacy/tests/regression/test_issue3962.py | 112 (new file)

@@ -0,0 +1,112 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+from ..util import get_doc
+
+
+@pytest.fixture
+def doc(en_tokenizer):
+    text = "He jests at scars, that never felt a wound."
+    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
+    deps = [
+        "nsubj",
+        "ccomp",
+        "prep",
+        "pobj",
+        "punct",
+        "nsubj",
+        "neg",
+        "ROOT",
+        "det",
+        "dobj",
+        "punct",
+    ]
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_issue3962(doc):
+    """ Ensure that as_doc does not result in out-of-bound access of tokens.
+    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    span2 = doc[1:5]  # "jests at scars ,"
+    doc2 = span2.as_doc()
+    doc2_json = doc2.to_json()
+    assert doc2_json
+
+    assert doc2[0].head.text == "jests"  # head set to itself, being the new artificial root
+    assert doc2[0].dep_ == "dep"
+    assert doc2[1].head.text == "jests"
+    assert doc2[1].dep_ == "prep"
+    assert doc2[2].head.text == "at"
+    assert doc2[2].dep_ == "pobj"
+    assert doc2[3].head.text == "jests"  # head set to the new artificial root
+    assert doc2[3].dep_ == "dep"
+
+    # We should still have 1 sentence
+    assert len(list(doc2.sents)) == 1
+
+    span3 = doc[6:9]  # "never felt a"
+    doc3 = span3.as_doc()
+    doc3_json = doc3.to_json()
+    assert doc3_json
+
+    assert doc3[0].head.text == "felt"
+    assert doc3[0].dep_ == "neg"
+    assert doc3[1].head.text == "felt"
+    assert doc3[1].dep_ == "ROOT"
+    assert doc3[2].head.text == "felt"  # head set to ancestor
+    assert doc3[2].dep_ == "dep"
+
+    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
+    assert len(list(doc3.sents)) == 1
+
+
+@pytest.fixture
+def two_sent_doc(en_tokenizer):
+    text = "He jests at scars. They never felt a wound."
+    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
+    deps = [
+        "nsubj",
+        "ROOT",
+        "prep",
+        "pobj",
+        "punct",
+        "nsubj",
+        "neg",
+        "ROOT",
+        "det",
+        "dobj",
+        "punct",
+    ]
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_issue3962_long(two_sent_doc):
+    """ Ensure that as_doc does not result in out-of-bound access of tokens.
+    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
+    doc2 = span2.as_doc()
+    doc2_json = doc2.to_json()
+    assert doc2_json
+
+    assert doc2[0].head.text == "jests"  # head set to itself, being the new artificial root (in sentence 1)
+    assert doc2[0].dep_ == "ROOT"
+    assert doc2[1].head.text == "jests"
+    assert doc2[1].dep_ == "prep"
+    assert doc2[2].head.text == "at"
+    assert doc2[2].dep_ == "pobj"
+    assert doc2[3].head.text == "jests"
+    assert doc2[3].dep_ == "punct"
+    assert doc2[4].head.text == "They"  # head set to itself, being the new artificial root (in sentence 2)
+    assert doc2[4].dep_ == "dep"
+    assert doc2[5].head.text == "They"  # head set to the new artificial head (in sentence 2)
+    assert doc2[5].dep_ == "dep"
+
+    # We should still have 2 sentences
+    sents = list(doc2.sents)
+    assert len(sents) == 2
+    assert sents[0].text == "jests at scars ."
+    assert sents[1].text == "They never"
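
A note on the test helper: get_doc (from the test suite's util module) takes heads as offsets relative to each token rather than absolute indices, with an offset of 0 marking a sentence root. A plain-Python illustration, reusing the values from the first fixture (the loop itself is not spaCy API):

# How the relative `heads` offsets in the fixtures map to absolute heads.
words = ["He", "jests", "at", "scars", ",", "that",
         "never", "felt", "a", "wound", "."]
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
for i, (word, offset) in enumerate(zip(words, heads)):
    print(word, "->", words[i + offset])  # 0 means the token is its own head
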
spacy/tokens/doc.pyx

@@ -794,7 +794,7 @@ cdef class Doc:
             if array[i, col] != 0:
                 self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
         # Now load the data
-        for i in range(self.length):
+        for i in range(length):
             token = &self.c[i]
             for j in range(n_attrs):
                 if attr_ids[j] != TAG:
@@ -804,7 +804,7 @@ cdef class Doc:
         self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
         # If document is parsed, set children
         if self.is_parsed:
             set_children_from_heads(self.c, self.length)
-            set_children_from_heads(self.c, self.length)
+            set_children_from_heads(self.c, length)
         return self

     def get_lca_matrix(self):
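
These two hunks implement the "using length instead of self.length" bullet: presumably length = len(array) is defined earlier in from_array (outside this hunk), so the copy loop and set_children_from_heads are bounded by the array's row count rather than the doc's token count. A toy illustration of the bug class, in plain Python/NumPy rather than the Cython internals:

# Bounding a copy loop by the destination's token count instead of the
# source array's row count overruns the array when it is a span-sized slice.
import numpy as np

sliced = np.arange(22).reshape(11, 2)[1:5]  # attribute rows for a 4-token span
doc_length = 11                             # the original doc had 11 tokens

for i in range(len(sliced)):                # safe: bounded by the array itself
    row = sliced[i]

# for i in range(doc_length):               # unsafe: IndexError once i == 4
#     row = sliced[i]
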
spacy/tokens/span.pyx

@@ -17,6 +17,7 @@ from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..attrs cimport *
 from ..lexeme cimport Lexeme
+from ..symbols cimport dep

 from ..util import normalize_slice
 from ..compat import is_config, basestring_
@@ -206,7 +207,6 @@ cdef class Span:

         DOCS: https://spacy.io/api/span#as_doc
         """
-        # TODO: Fix!
         words = [t.text for t in self]
         spaces = [bool(t.whitespace_) for t in self]
         cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
@@ -220,7 +220,9 @@ cdef class Span:
         else:
             array_head.append(SENT_START)
         array = self.doc.to_array(array_head)
-        doc.from_array(array_head, array[self.start : self.end])
+        array = array[self.start : self.end]
+        self._fix_dep_copy(array_head, array)
+        doc.from_array(array_head, array)
         doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
         doc.user_hooks = self.doc.user_hooks
         doc.user_span_hooks = self.doc.user_span_hooks
@@ -235,6 +237,44 @@ cdef class Span:
             doc.cats[cat_label] = value
         return doc

+    def _fix_dep_copy(self, attrs, array):
+        """ Rewire dependency links to make sure their heads fall into the span
+        while still keeping the correct number of sentences. """
+        cdef int length = len(array)
+        cdef attr_t value
+        cdef int i, head_col, ancestor_i
+        old_to_new_root = dict()
+        if HEAD in attrs:
+            head_col = attrs.index(HEAD)
+            for i in range(length):
+                # if the HEAD refers to a token outside this span, find a more appropriate ancestor
+                token = self[i]
+                ancestor_i = token.head.i - self.start  # span offset
+                if ancestor_i not in range(length):
+                    if DEP in attrs:
+                        array[i, attrs.index(DEP)] = dep
+
+                    # try finding an ancestor within this span
+                    ancestors = token.ancestors
+                    for ancestor in ancestors:
+                        ancestor_i = ancestor.i - self.start
+                        if ancestor_i in range(length):
+                            array[i, head_col] = ancestor_i - i
+
+                # if there is no appropriate ancestor, define a new artificial root
+                value = array[i, head_col]
+                if (i+value) not in range(length):
+                    new_root = old_to_new_root.get(ancestor_i, None)
+                    if new_root is not None:
+                        # take the same artificial root as a previous token from the same sentence
+                        array[i, head_col] = new_root - i
+                    else:
+                        # set this token as the new artificial root
+                        array[i, head_col] = 0
+                        old_to_new_root[ancestor_i] = i
+        return array
+
     def merge(self, *args, **attributes):
         """Retokenize the document, such that the span is merged into a single
         token.
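
To make the rewiring concrete, here is a standalone plain-Python sketch of the artificial-root fallback above (fix_heads is a hypothetical name, not spaCy API, and the in-span ancestor search over token.ancestors is omitted for brevity). It works on relative head offsets, as the HEAD column does, and keeps the old_to_new_root bookkeeping so that tokens whose heads leave the span through the same original ancestor share one artificial root, which is what preserves the sentence count:

# Hypothetical sketch of _fix_dep_copy's root-promotion step.
def fix_heads(rel_heads, start, end):
    length = end - start
    fixed = list(rel_heads[start:end])
    old_to_new_root = {}  # old absolute head index -> new in-span root
    for i in range(length):
        if i + fixed[i] not in range(length):  # head falls outside the span
            old_head = start + i + fixed[i]
            # reuse this sentence's artificial root if one was already chosen,
            # otherwise promote the current token (offset 0 marks a root)
            root = old_to_new_root.setdefault(old_head, i)
            fixed[i] = root - i
    return fixed

# Offsets for "He jests at scars, that never felt a wound." (first fixture);
# slicing tokens 1..5 mirrors doc2 in test_issue3962:
print(fix_heads([1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3], 1, 5))
# -> [0, -1, -1, -3]: "jests" becomes the root and "," re-attaches to it
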
@@ -500,7 +540,7 @@ cdef class Span:
         if "root" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["root"](self)
         # This should probably be called 'head', and the other one called
-        # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/
+        # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
         cdef int i
         # First, we scan through the Span, and check whether there's a word
         # with head==0, i.e. a sentence root. If so, we can return it. The