Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal committed 2018-12-30 15:49:57 +01:00
commit 3c09d3d986
5 changed files with 56 additions and 14 deletions


@@ -152,6 +152,9 @@ def test_span_as_doc(doc):
     span = doc[4:10]
     span_doc = span.as_doc()
     assert span.text == span_doc.text.strip()
+    assert isinstance(span_doc, doc.__class__)
+    assert span_doc is not doc
+    assert span_doc[0].idx == 0
 
 
 def test_span_string_label(doc):
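
The new `idx` assertion pins down the copy semantics: character offsets in the returned doc are relative to the copied text, not to the original. A minimal sketch of what it checks (assuming a plain `spacy.blank('en')` tokenizer and spaCy with this change applied):

    import spacy

    nlp = spacy.blank('en')
    doc = nlp(u'I like New York in Autumn.')
    span = doc[2:4]              # "New York"
    span_doc = span.as_doc()
    assert span.start_char == 7  # "New" starts at char 7 of the original text
    assert span_doc[0].idx == 0  # ...but at char 0 of the copied text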


@@ -247,6 +247,16 @@ def test_issue1945():
     assert matches[1][1:] == (1, 3)
 
 
+def test_issue1963(en_tokenizer):
+    """Test that retokenizing with merge resizes doc.tensor"""
+    doc = en_tokenizer('a b c d')
+    doc.tensor = numpy.ones((len(doc), 128), dtype='f')
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2])
+    assert len(doc) == 3
+    assert doc.tensor.shape == (3, 128)
+
+
 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
     ner = EntityRecognizer(Vocab())


@@ -7,7 +7,9 @@ from __future__ import unicode_literals
 from libc.string cimport memcpy, memset
 from libc.stdlib cimport malloc, free
+import numpy
 from cymem.cymem cimport Pool
+from thinc.neural.util import get_array_module
 
 from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end
 from .span cimport Span
@@ -83,6 +85,11 @@ def _merge(Doc doc, int start, int end, attributes):
     cdef Span span = doc[start:end]
     cdef int start_char = span.start_char
     cdef int end_char = span.end_char
+    # Resize the doc.tensor, if it's set. Let the last row for each token
+    # stand for the merged region. To do this, we build a list of the row
+    # indices to drop, then remove them with numpy.delete.
+    if doc.tensor is not None and doc.tensor.size != 0:
+        doc.tensor = _resize_tensor(doc.tensor, [(start, end)])
     # Get LexemeC for newly merged token
     new_orth = ''.join([t.text_with_ws for t in span])
     if span[-1].whitespace_:
@@ -182,7 +189,12 @@ def _bulk_merge(Doc doc, merges):
         else:
             Token.set_struct_attr(token, attr_name, attr_value)
+    # Resize the doc.tensor, if it's set. Let the last row for each token
+    # stand for the merged region. To do this, we build a list of the row
+    # indices to drop, then remove them with numpy.delete.
+    if doc.tensor is not None and doc.tensor.size != 0:
+        doc.tensor = _resize_tensor(doc.tensor,
+            [(m[1][0].start, m[1][0].end) for m in merges])
     # Memorize span roots and set the dependencies of the newly merged
     # tokens to the dependencies of their roots.
     span_roots = []
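
The bookkeeping described in that comment is visible from Python: after a merge, the surviving token is attached with the head and dependency label of its span's syntactic root. A hedged sketch, assuming a pretrained parsing pipeline such as `en_core_web_sm` is installed and produces the typical parse:

    import spacy

    nlp = spacy.load('en_core_web_sm')  # assumption: model installed
    doc = nlp(u'I like New York')
    root = doc[2:4].root                # typically "York", the head of the span
    root_dep = root.dep_
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
    merged = doc[2]
    assert merged.text == u'New York'
    assert merged.dep_ == root_dep      # merged token keeps the root's relation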
@@ -276,6 +288,14 @@ def _bulk_merge(Doc doc, merges):
         else:
             # If they're not the same entity type, let them be two entities
             doc.c[token_after_span_position].ent_iob = 3
     # Return the merged Python object
     return doc[spans[0].start]
+
+
+def _resize_tensor(tensor, ranges):
+    delete = []
+    for start, end in ranges:
+        for i in range(start, end-1):
+            delete.append(i)
+    xp = get_array_module(tensor)
+    return xp.delete(tensor, delete, axis=0)
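
How the helper picks rows: for each merged range it marks rows `start` through `end-2` for deletion, so the last row (`end-1`) survives as the representation of the merged region. A small numpy-only check of that logic (plain `numpy` standing in for `get_array_module`):

    import numpy

    tensor = numpy.arange(4 * 2, dtype='f').reshape(4, 2)  # one row per token
    start, end = 0, 2                     # merge tokens 0 and 1
    delete = list(range(start, end - 1))  # rows [0]; row end-1 == 1 survives
    resized = numpy.delete(tensor, delete, axis=0)
    assert resized.shape == (3, 2)
    assert (resized[0] == tensor[1]).all()  # merged token keeps the old row 1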


@@ -14,7 +14,7 @@ from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..util import normalize_slice
-from ..attrs cimport IS_PUNCT, IS_SPACE
+from ..attrs cimport *
 from ..lexeme cimport Lexeme
 from ..compat import is_config, basestring_
 from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
@@ -149,23 +149,32 @@ cdef class Span:
     def as_doc(self):
         # TODO: fix
-        """Create a `Doc` object view of the Span's data. This is mostly
-        useful for C-typed interfaces.
+        """Create a `Doc` object with a copy of the Span's data.
 
-        RETURNS (Doc): The `Doc` view of the span.
+        RETURNS (Doc): The `Doc` copy of the span.
         """
-        cdef Doc doc = Doc(self.doc.vocab)
-        doc.length = self.end-self.start
-        doc.c = &self.doc.c[self.start]
-        doc.mem = self.doc.mem
-        doc.is_parsed = self.doc.is_parsed
-        doc.is_tagged = self.doc.is_tagged
+        cdef Doc doc = Doc(self.doc.vocab,
+            words=[t.text for t in self],
+            spaces=[bool(t.whitespace_) for t in self])
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]
+        if self.doc.is_tagged:
+            array_head.append(TAG)
+        # If the doc is parsed, add head and dep attributes
+        if self.doc.is_parsed:
+            array_head.extend([HEAD, DEP])
+        # Otherwise add sent_start
+        else:
+            array_head.append(SENT_START)
+        array = self.doc.to_array(array_head)
+        doc.from_array(array_head, array[self.start : self.end])
         doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
         doc.user_hooks = self.doc.user_hooks
         doc.user_span_hooks = self.doc.user_span_hooks
         doc.user_token_hooks = self.doc.user_token_hooks
         doc.vector = self.vector
         doc.vector_norm = self.vector_norm
+        doc.tensor = self.doc.tensor[self.start : self.end]
         for key, value in self.doc.cats.items():
             if hasattr(key, '__len__') and len(key) == 3:
                 cat_start, cat_end, cat_label = key
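
The core move in the new implementation is an attribute round-trip: export the selected columns with `Doc.to_array`, slice out the span's rows, and load them into the fresh doc with `Doc.from_array`. A standalone sketch of that mechanism (attribute names taken from the hunk above; `spacy.blank('en')` is assumed just to get a tokenizer):

    import spacy
    from spacy.attrs import LEMMA, ENT_IOB, ENT_TYPE
    from spacy.tokens import Doc

    nlp = spacy.blank('en')
    doc = nlp(u'I like New York in Autumn.')
    start, end = 2, 4  # the "New York" tokens

    # Build the copy from the span's words and trailing whitespace flags
    words = [t.text for t in doc[start:end]]
    spaces = [bool(t.whitespace_) for t in doc[start:end]]
    copy = Doc(doc.vocab, words=words, spaces=spaces)

    # Round-trip token attributes: export, slice the span's rows, re-import
    array_head = [LEMMA, ENT_IOB, ENT_TYPE]
    array = doc.to_array(array_head)
    copy.from_array(array_head, array[start:end])
    assert copy.text.strip() == u'New York'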


@@ -377,8 +377,8 @@ p
 +h(2, "as_doc") Span.as_doc
 
 p
-    | Create a #[code Doc] object view of the #[code Span]'s data. Mostly
-    | useful for C-typed interfaces.
+    | Create a new #[code Doc] object corresponding to the #[code Span], with
+    | a copy of the data.
 
 +aside-code("Example").
     doc = nlp(u'I like New York in Autumn.')
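
The docs example aside is truncated here; presumably it continues along these lines (a sketch of the documented usage, not the verbatim jade source):

    doc = nlp(u'I like New York in Autumn.')
    span = doc[2:4]
    span_doc = span.as_doc()
    # An independent copy of the span's data, not a view into `doc`
    assert span_doc.text == u'New York'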