mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
3c09d3d986
|
@ -152,6 +152,9 @@ def test_span_as_doc(doc):
|
||||||
span = doc[4:10]
|
span = doc[4:10]
|
||||||
span_doc = span.as_doc()
|
span_doc = span.as_doc()
|
||||||
assert span.text == span_doc.text.strip()
|
assert span.text == span_doc.text.strip()
|
||||||
|
assert isinstance(span_doc, doc.__class__)
|
||||||
|
assert span_doc is not doc
|
||||||
|
assert span_doc[0].idx == 0
|
||||||
|
|
||||||
|
|
||||||
def test_span_string_label(doc):
|
def test_span_string_label(doc):
|
||||||
|
|
|
@ -247,6 +247,16 @@ def test_issue1945():
|
||||||
assert matches[1][1:] == (1, 3)
|
assert matches[1][1:] == (1, 3)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue1963(en_tokenizer):
    """Check that merging tokens through the retokenizer resizes doc.tensor."""
    doc = en_tokenizer('a b c d')
    n_tokens = len(doc)
    # Give the doc a tensor with one row per token before merging.
    doc.tensor = numpy.ones((n_tokens, 128), dtype='f')
    with doc.retokenize() as retokenizer:
        merged_span = doc[0:2]
        retokenizer.merge(merged_span)
    # After merging doc[0:2] the doc is expected to hold 3 tokens, and the
    # tensor is expected to have shrunk to 3 rows with its width unchanged.
    assert len(doc) == 3
    assert doc.tensor.shape == (3, 128)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
|
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
|
||||||
def test_issue1967(label):
|
def test_issue1967(label):
|
||||||
ner = EntityRecognizer(Vocab())
|
ner = EntityRecognizer(Vocab())
|
||||||
|
|
|
@ -7,7 +7,9 @@ from __future__ import unicode_literals
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
from libc.stdlib cimport malloc, free
|
from libc.stdlib cimport malloc, free
|
||||||
|
|
||||||
|
import numpy
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
from thinc.neural.util import get_array_module
|
||||||
|
|
||||||
from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end
|
from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end
|
||||||
from .span cimport Span
|
from .span cimport Span
|
||||||
|
@ -83,6 +85,11 @@ def _merge(Doc doc, int start, int end, attributes):
|
||||||
cdef Span span = doc[start:end]
|
cdef Span span = doc[start:end]
|
||||||
cdef int start_char = span.start_char
|
cdef int start_char = span.start_char
|
||||||
cdef int end_char = span.end_char
|
cdef int end_char = span.end_char
|
||||||
|
# Resize the doc.tensor, if it's set. Let the last row of the merged span
|
||||||
|
# stand for the merged region. To do this, _resize_tensor collects the
|
||||||
|
# indices of the rows to be deleted, then calls the array module's delete
|
||||||
|
if doc.tensor is not None and doc.tensor.size != 0:
|
||||||
|
doc.tensor = _resize_tensor(doc.tensor, [(start, end)])
|
||||||
# Get LexemeC for newly merged token
|
# Get LexemeC for newly merged token
|
||||||
new_orth = ''.join([t.text_with_ws for t in span])
|
new_orth = ''.join([t.text_with_ws for t in span])
|
||||||
if span[-1].whitespace_:
|
if span[-1].whitespace_:
|
||||||
|
@ -182,7 +189,12 @@ def _bulk_merge(Doc doc, merges):
|
||||||
else:
|
else:
|
||||||
Token.set_struct_attr(token, attr_name, attr_value)
|
Token.set_struct_attr(token, attr_name, attr_value)
|
||||||
|
|
||||||
|
# Resize the doc.tensor, if it's set. Let the last row of each merged span
|
||||||
|
# stand for the merged region. To do this, _resize_tensor collects the
|
||||||
|
# indices of the rows to be deleted, then calls the array module's delete
|
||||||
|
if doc.tensor is not None and doc.tensor.size != 0:
|
||||||
|
doc.tensor = _resize_tensor(doc.tensor,
|
||||||
|
[(m[1][0].start, m[1][0].end) for m in merges])
|
||||||
# Memorize span roots and sets dependencies of the newly merged
|
# Memorize span roots and sets dependencies of the newly merged
|
||||||
# tokens to the dependencies of their roots.
|
# tokens to the dependencies of their roots.
|
||||||
span_roots = []
|
span_roots = []
|
||||||
|
@ -276,6 +288,14 @@ def _bulk_merge(Doc doc, merges):
|
||||||
else:
|
else:
|
||||||
# If they're not the same entity type, let them be two entities
|
# If they're not the same entity type, let them be two entities
|
||||||
doc.c[token_after_span_position].ent_iob = 3
|
doc.c[token_after_span_position].ent_iob = 3
|
||||||
|
|
||||||
# Return the merged Python object
|
# Return the merged Python object
|
||||||
return doc[spans[0].start]
|
return doc[spans[0].start]
|
||||||
|
|
||||||
|
|
||||||
|
def _resize_tensor(tensor, ranges):
    """Remove the tensor rows made redundant by merging the given ranges.

    For every (start, end) pair, the rows start .. end-2 are deleted along
    axis 0, so the row at index end-1 is the one that remains for that range.

    tensor: The array to shrink; its array module is looked up with
        get_array_module.
    ranges: An iterable of (start, end) index pairs.
    RETURNS: Whatever the array module's `delete` returns for those rows.
    """
    # Every row index of each range except the last one.
    doomed_rows = [
        row
        for first, stop in ranges
        for row in range(first, stop - 1)
    ]
    array_module = get_array_module(tensor)
    return array_module.delete(tensor, doomed_rows, axis=0)
|
||||||
|
|
|
@ -14,7 +14,7 @@ from ..typedefs cimport flags_t, attr_t, hash_t
|
||||||
from ..attrs cimport attr_id_t
|
from ..attrs cimport attr_id_t
|
||||||
from ..parts_of_speech cimport univ_pos_t
|
from ..parts_of_speech cimport univ_pos_t
|
||||||
from ..util import normalize_slice
|
from ..util import normalize_slice
|
||||||
from ..attrs cimport IS_PUNCT, IS_SPACE
|
from ..attrs cimport *
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from ..compat import is_config, basestring_
|
from ..compat import is_config, basestring_
|
||||||
from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
|
from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
|
||||||
|
@ -149,23 +149,32 @@ cdef class Span:
|
||||||
|
|
||||||
def as_doc(self):
|
def as_doc(self):
|
||||||
# TODO: fix
|
# TODO: fix
|
||||||
"""Create a `Doc` object view of the Span's data. This is mostly
|
"""Create a `Doc` object with a copy of the Span's data.
|
||||||
useful for C-typed interfaces.
|
|
||||||
|
|
||||||
RETURNS (Doc): The `Doc` view of the span.
|
RETURNS (Doc): The `Doc` copy of the span.
|
||||||
"""
|
"""
|
||||||
cdef Doc doc = Doc(self.doc.vocab)
|
cdef Doc doc = Doc(self.doc.vocab,
|
||||||
doc.length = self.end-self.start
|
words=[t.text for t in self],
|
||||||
doc.c = &self.doc.c[self.start]
|
spaces=[bool(t.whitespace_) for t in self])
|
||||||
doc.mem = self.doc.mem
|
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]
|
||||||
doc.is_parsed = self.doc.is_parsed
|
if self.doc.is_tagged:
|
||||||
doc.is_tagged = self.doc.is_tagged
|
array_head.append(TAG)
|
||||||
|
# if doc parsed add head and dep attribute
|
||||||
|
if self.doc.is_parsed:
|
||||||
|
array_head.extend([HEAD, DEP])
|
||||||
|
# otherwise add sent_start
|
||||||
|
else:
|
||||||
|
array_head.append(SENT_START)
|
||||||
|
array = self.doc.to_array(array_head)
|
||||||
|
doc.from_array(array_head, array[self.start : self.end])
|
||||||
|
|
||||||
doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
|
doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
|
||||||
doc.user_hooks = self.doc.user_hooks
|
doc.user_hooks = self.doc.user_hooks
|
||||||
doc.user_span_hooks = self.doc.user_span_hooks
|
doc.user_span_hooks = self.doc.user_span_hooks
|
||||||
doc.user_token_hooks = self.doc.user_token_hooks
|
doc.user_token_hooks = self.doc.user_token_hooks
|
||||||
doc.vector = self.vector
|
doc.vector = self.vector
|
||||||
doc.vector_norm = self.vector_norm
|
doc.vector_norm = self.vector_norm
|
||||||
|
doc.tensor = self.doc.tensor[self.start : self.end]
|
||||||
for key, value in self.doc.cats.items():
|
for key, value in self.doc.cats.items():
|
||||||
if hasattr(key, '__len__') and len(key) == 3:
|
if hasattr(key, '__len__') and len(key) == 3:
|
||||||
cat_start, cat_end, cat_label = key
|
cat_start, cat_end, cat_label = key
|
||||||
|
|
|
@ -377,8 +377,8 @@ p
|
||||||
+h(2, "as_doc") Span.as_doc
|
+h(2, "as_doc") Span.as_doc
|
||||||
|
|
||||||
p
|
p
|
||||||
| Create a #[code Doc] object view of the #[code Span]'s data. Mostly
|
| Create a new #[code Doc] object corresponding to the #[code Span], with
|
||||||
| useful for C-typed interfaces.
|
| a copy of the data.
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
doc = nlp(u'I like New York in Autumn.')
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user