mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-04 06:16:33 +03:00
Merge remote-tracking branch 'upstream/v4' into store-activations
This commit is contained in:
commit
df513beddb
|
@ -256,6 +256,10 @@ cdef class Matcher:
|
||||||
# non-overlapping ones this `match` can be either (start, end) or
|
# non-overlapping ones this `match` can be either (start, end) or
|
||||||
# (start, end, alignments) depending on `with_alignments=` option.
|
# (start, end, alignments) depending on `with_alignments=` option.
|
||||||
for key, *match in matches:
|
for key, *match in matches:
|
||||||
|
# Adjust span matches to doc offsets
|
||||||
|
if isinstance(doclike, Span):
|
||||||
|
match[0] += doclike.start
|
||||||
|
match[1] += doclike.start
|
||||||
span_filter = self._filter.get(key)
|
span_filter = self._filter.get(key)
|
||||||
if span_filter is not None:
|
if span_filter is not None:
|
||||||
pairs = pairs_by_id.get(key, [])
|
pairs = pairs_by_id.get(key, [])
|
||||||
|
@ -286,9 +290,6 @@ cdef class Matcher:
|
||||||
if as_spans:
|
if as_spans:
|
||||||
final_results = []
|
final_results = []
|
||||||
for key, start, end, *_ in final_matches:
|
for key, start, end, *_ in final_matches:
|
||||||
if isinstance(doclike, Span):
|
|
||||||
start += doclike.start
|
|
||||||
end += doclike.start
|
|
||||||
final_results.append(Span(doc, start, end, label=key))
|
final_results.append(Span(doc, start, end, label=key))
|
||||||
elif with_alignments:
|
elif with_alignments:
|
||||||
# convert alignments List[Dict[str, int]] --> List[int]
|
# convert alignments List[Dict[str, int]] --> List[int]
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
|
from libcpp.memory cimport shared_ptr
|
||||||
|
from libcpp.vector cimport vector
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
@ -43,9 +45,7 @@ MOVE_NAMES[OUT] = 'O'
|
||||||
|
|
||||||
cdef struct GoldNERStateC:
|
cdef struct GoldNERStateC:
|
||||||
Transition* ner
|
Transition* ner
|
||||||
SpanC* negs
|
vector[shared_ptr[SpanC]] negs
|
||||||
int32_t length
|
|
||||||
int32_t nr_neg
|
|
||||||
|
|
||||||
|
|
||||||
cdef class BiluoGold:
|
cdef class BiluoGold:
|
||||||
|
@ -78,8 +78,6 @@ cdef GoldNERStateC create_gold_state(
|
||||||
negs = []
|
negs = []
|
||||||
assert example.x.length > 0
|
assert example.x.length > 0
|
||||||
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
|
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
|
||||||
gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
|
|
||||||
gs.nr_neg = len(negs)
|
|
||||||
ner_ents, ner_tags = example.get_aligned_ents_and_ner()
|
ner_ents, ner_tags = example.get_aligned_ents_and_ner()
|
||||||
for i, ner_tag in enumerate(ner_tags):
|
for i, ner_tag in enumerate(ner_tags):
|
||||||
gs.ner[i] = moves.lookup_transition(ner_tag)
|
gs.ner[i] = moves.lookup_transition(ner_tag)
|
||||||
|
@ -93,8 +91,8 @@ cdef GoldNERStateC create_gold_state(
|
||||||
# In order to handle negative samples, we need to maintain the full
|
# In order to handle negative samples, we need to maintain the full
|
||||||
# (start, end, label) triple. If we break it down to the 'isnt B-LOC'
|
# (start, end, label) triple. If we break it down to the 'isnt B-LOC'
|
||||||
# thing, we'll get blocked if there's an incorrect prefix.
|
# thing, we'll get blocked if there's an incorrect prefix.
|
||||||
for i, neg in enumerate(negs):
|
for neg in negs:
|
||||||
gs.negs[i] = neg.c
|
gs.negs.push_back(neg.c)
|
||||||
return gs
|
return gs
|
||||||
|
|
||||||
|
|
||||||
|
@ -411,6 +409,8 @@ cdef class Begin:
|
||||||
cdef int g_act = gold.ner[b0].move
|
cdef int g_act = gold.ner[b0].move
|
||||||
cdef attr_t g_tag = gold.ner[b0].label
|
cdef attr_t g_tag = gold.ner[b0].label
|
||||||
|
|
||||||
|
cdef shared_ptr[SpanC] span
|
||||||
|
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
pass
|
pass
|
||||||
elif g_act == BEGIN:
|
elif g_act == BEGIN:
|
||||||
|
@ -428,8 +428,8 @@ cdef class Begin:
|
||||||
# be correct or not. However, we can at least tell whether we're
|
# be correct or not. However, we can at least tell whether we're
|
||||||
# going to be opening an entity where there's only one possible
|
# going to be opening an entity where there's only one possible
|
||||||
# L.
|
# L.
|
||||||
for span in gold.negs[:gold.nr_neg]:
|
for span in gold.negs:
|
||||||
if span.label == label and span.start == b0:
|
if span.get().label == label and span.get().start == b0:
|
||||||
cost += 1
|
cost += 1
|
||||||
break
|
break
|
||||||
return cost
|
return cost
|
||||||
|
@ -574,8 +574,9 @@ cdef class Last:
|
||||||
# If we have negative-example entities, integrate them into the objective,
|
# If we have negative-example entities, integrate them into the objective,
|
||||||
# by marking actions that close an entity that we know is incorrect
|
# by marking actions that close an entity that we know is incorrect
|
||||||
# as costly.
|
# as costly.
|
||||||
for span in gold.negs[:gold.nr_neg]:
|
cdef shared_ptr[SpanC] span
|
||||||
if span.label == label and (span.end-1) == b0 and span.start == ent_start:
|
for span in gold.negs:
|
||||||
|
if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
|
||||||
cost += 1
|
cost += 1
|
||||||
break
|
break
|
||||||
return cost
|
return cost
|
||||||
|
@ -639,8 +640,9 @@ cdef class Unit:
|
||||||
# This is fairly straight-forward for U- entities, as we have a single
|
# This is fairly straight-forward for U- entities, as we have a single
|
||||||
# action
|
# action
|
||||||
cdef int b0 = s.B(0)
|
cdef int b0 = s.B(0)
|
||||||
for span in gold.negs[:gold.nr_neg]:
|
cdef shared_ptr[SpanC] span
|
||||||
if span.label == label and span.start == b0 and span.end == (b0+1):
|
for span in gold.negs:
|
||||||
|
if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
|
||||||
cost += 1
|
cost += 1
|
||||||
break
|
break
|
||||||
return cost
|
return cost
|
||||||
|
|
|
@ -4,7 +4,7 @@ from numpy.testing import assert_array_equal
|
||||||
|
|
||||||
from spacy.attrs import ORTH, LENGTH
|
from spacy.attrs import ORTH, LENGTH
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.tokens import Doc, Span, Token
|
from spacy.tokens import Doc, Span, SpanGroup, Token
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.util import filter_spans
|
from spacy.util import filter_spans
|
||||||
from thinc.api import get_current_ops
|
from thinc.api import get_current_ops
|
||||||
|
@ -163,6 +163,18 @@ def test_char_span(doc, i_sent, i, j, text):
|
||||||
assert span.text == text
|
assert span.text == text
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(9556)
|
||||||
|
def test_modify_span_group(doc):
|
||||||
|
group = SpanGroup(doc, spans=doc.ents)
|
||||||
|
for span in group:
|
||||||
|
span.start = 0
|
||||||
|
span.label = doc.vocab.strings["TEST"]
|
||||||
|
|
||||||
|
# Span changes must be reflected in the span group
|
||||||
|
assert group[0].start == 0
|
||||||
|
assert group[0].label == doc.vocab.strings["TEST"]
|
||||||
|
|
||||||
|
|
||||||
def test_spans_sent_spans(doc):
|
def test_spans_sent_spans(doc):
|
||||||
sents = list(doc.sents)
|
sents = list(doc.sents)
|
||||||
assert sents[0].start == 0
|
assert sents[0].start == 0
|
||||||
|
|
|
@ -99,8 +99,10 @@ def test_span_group_set_item(doc, other_doc):
|
||||||
span.label_ = "NEW LABEL"
|
span.label_ = "NEW LABEL"
|
||||||
span.kb_id = doc.vocab.strings["KB_ID"]
|
span.kb_id = doc.vocab.strings["KB_ID"]
|
||||||
|
|
||||||
assert span_group[index].label != span.label
|
# Indexing a span group returns a span in which C
|
||||||
assert span_group[index].kb_id != span.kb_id
|
# data is shared.
|
||||||
|
assert span_group[index].label == span.label
|
||||||
|
assert span_group[index].kb_id == span.kb_id
|
||||||
|
|
||||||
span_group[index] = span
|
span_group[index] = span
|
||||||
assert span_group[index].start == span.start
|
assert span_group[index].start == span.start
|
||||||
|
|
|
@ -602,9 +602,16 @@ def test_matcher_span(matcher):
|
||||||
doc = Doc(matcher.vocab, words=text.split())
|
doc = Doc(matcher.vocab, words=text.split())
|
||||||
span_js = doc[:3]
|
span_js = doc[:3]
|
||||||
span_java = doc[4:]
|
span_java = doc[4:]
|
||||||
assert len(matcher(doc)) == 2
|
doc_matches = matcher(doc)
|
||||||
assert len(matcher(span_js)) == 1
|
span_js_matches = matcher(span_js)
|
||||||
assert len(matcher(span_java)) == 1
|
span_java_matches = matcher(span_java)
|
||||||
|
assert len(doc_matches) == 2
|
||||||
|
assert len(span_js_matches) == 1
|
||||||
|
assert len(span_java_matches) == 1
|
||||||
|
|
||||||
|
# match offsets always refer to the doc
|
||||||
|
assert doc_matches[0] == span_js_matches[0]
|
||||||
|
assert doc_matches[1] == span_java_matches[0]
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_as_spans(matcher):
|
def test_matcher_as_spans(matcher):
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from libcpp.memory cimport shared_ptr
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
|
||||||
from .doc cimport Doc
|
from .doc cimport Doc
|
||||||
|
@ -7,19 +8,21 @@ from ..structs cimport SpanC
|
||||||
|
|
||||||
cdef class Span:
|
cdef class Span:
|
||||||
cdef readonly Doc doc
|
cdef readonly Doc doc
|
||||||
cdef SpanC c
|
cdef shared_ptr[SpanC] c
|
||||||
cdef public _vector
|
cdef public _vector
|
||||||
cdef public _vector_norm
|
cdef public _vector_norm
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline Span cinit(Doc doc, SpanC span):
|
cdef inline Span cinit(Doc doc, const shared_ptr[SpanC] &span):
|
||||||
cdef Span self = Span.__new__(
|
cdef Span self = Span.__new__(
|
||||||
Span,
|
Span,
|
||||||
doc,
|
doc,
|
||||||
start=span.start,
|
start=span.get().start,
|
||||||
end=span.end
|
end=span.get().end
|
||||||
)
|
)
|
||||||
self.c = span
|
self.c = span
|
||||||
return self
|
return self
|
||||||
|
|
||||||
cpdef np.ndarray to_array(self, object features)
|
cpdef np.ndarray to_array(self, object features)
|
||||||
|
|
||||||
|
cdef SpanC* span_c(self)
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
from libc.math cimport sqrt
|
from libc.math cimport sqrt
|
||||||
|
from libcpp.memory cimport make_shared
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from thinc.api import get_array_module
|
from thinc.api import get_array_module
|
||||||
|
@ -114,7 +115,7 @@ cdef class Span:
|
||||||
end_char = start_char
|
end_char = start_char
|
||||||
else:
|
else:
|
||||||
end_char = doc[end - 1].idx + len(doc[end - 1])
|
end_char = doc[end - 1].idx + len(doc[end - 1])
|
||||||
self.c = SpanC(
|
self.c = make_shared[SpanC](SpanC(
|
||||||
label=label,
|
label=label,
|
||||||
kb_id=kb_id,
|
kb_id=kb_id,
|
||||||
id=span_id,
|
id=span_id,
|
||||||
|
@ -122,7 +123,7 @@ cdef class Span:
|
||||||
end=end,
|
end=end,
|
||||||
start_char=start_char,
|
start_char=start_char,
|
||||||
end_char=end_char,
|
end_char=end_char,
|
||||||
)
|
))
|
||||||
self._vector = vector
|
self._vector = vector
|
||||||
self._vector_norm = vector_norm
|
self._vector_norm = vector_norm
|
||||||
|
|
||||||
|
@ -132,8 +133,11 @@ cdef class Span:
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc)
|
|
||||||
other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.id, other.doc)
|
cdef SpanC* span_c = self.span_c()
|
||||||
|
cdef SpanC* other_span_c = other.span_c()
|
||||||
|
self_tuple = (span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, self.id, self.doc)
|
||||||
|
other_tuple = (other_span_c.start_char, other_span_c.end_char, other_span_c.label, other_span_c.kb_id, other.id, other.doc)
|
||||||
# <
|
# <
|
||||||
if op == 0:
|
if op == 0:
|
||||||
return self_tuple < other_tuple
|
return self_tuple < other_tuple
|
||||||
|
@ -154,7 +158,8 @@ cdef class Span:
|
||||||
return self_tuple >= other_tuple
|
return self_tuple >= other_tuple
|
||||||
|
|
||||||
def __hash__(self):
|
def __hash__(self):
|
||||||
return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.c.id))
|
cdef SpanC* span_c = self.span_c()
|
||||||
|
return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, span_c.id))
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""Get the number of tokens in the span.
|
"""Get the number of tokens in the span.
|
||||||
|
@ -163,9 +168,10 @@ cdef class Span:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/span#len
|
DOCS: https://spacy.io/api/span#len
|
||||||
"""
|
"""
|
||||||
if self.c.end < self.c.start:
|
cdef SpanC* span_c = self.span_c()
|
||||||
|
if span_c.end < span_c.start:
|
||||||
return 0
|
return 0
|
||||||
return self.c.end - self.c.start
|
return span_c.end - span_c.start
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return self.text
|
return self.text
|
||||||
|
@ -179,15 +185,16 @@ cdef class Span:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/span#getitem
|
DOCS: https://spacy.io/api/span#getitem
|
||||||
"""
|
"""
|
||||||
|
cdef SpanC* span_c = self.span_c()
|
||||||
if isinstance(i, slice):
|
if isinstance(i, slice):
|
||||||
start, end = normalize_slice(len(self), i.start, i.stop, i.step)
|
start, end = normalize_slice(len(self), i.start, i.stop, i.step)
|
||||||
return Span(self.doc, start + self.start, end + self.start)
|
return Span(self.doc, start + self.start, end + self.start)
|
||||||
else:
|
else:
|
||||||
if i < 0:
|
if i < 0:
|
||||||
token_i = self.c.end + i
|
token_i = span_c.end + i
|
||||||
else:
|
else:
|
||||||
token_i = self.c.start + i
|
token_i = span_c.start + i
|
||||||
if self.c.start <= token_i < self.c.end:
|
if span_c.start <= token_i < span_c.end:
|
||||||
return self.doc[token_i]
|
return self.doc[token_i]
|
||||||
else:
|
else:
|
||||||
raise IndexError(Errors.E1002)
|
raise IndexError(Errors.E1002)
|
||||||
|
@ -199,7 +206,8 @@ cdef class Span:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/span#iter
|
DOCS: https://spacy.io/api/span#iter
|
||||||
"""
|
"""
|
||||||
for i in range(self.c.start, self.c.end):
|
cdef SpanC* span_c = self.span_c()
|
||||||
|
for i in range(span_c.start, span_c.end):
|
||||||
yield self.doc[i]
|
yield self.doc[i]
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
|
@ -207,9 +215,10 @@ cdef class Span:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def _(self):
|
def _(self):
|
||||||
|
cdef SpanC* span_c = self.span_c()
|
||||||
"""Custom extension attributes registered via `set_extension`."""
|
"""Custom extension attributes registered via `set_extension`."""
|
||||||
return Underscore(Underscore.span_extensions, self,
|
return Underscore(Underscore.span_extensions, self,
|
||||||
start=self.c.start_char, end=self.c.end_char)
|
start=span_c.start_char, end=span_c.end_char)
|
||||||
|
|
||||||
def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
|
def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
|
||||||
"""Create a `Doc` object with a copy of the `Span`'s data.
|
"""Create a `Doc` object with a copy of the `Span`'s data.
|
||||||
|
@ -283,13 +292,14 @@ cdef class Span:
|
||||||
cdef int length = len(array)
|
cdef int length = len(array)
|
||||||
cdef attr_t value
|
cdef attr_t value
|
||||||
cdef int i, head_col, ancestor_i
|
cdef int i, head_col, ancestor_i
|
||||||
|
cdef SpanC* span_c = self.span_c()
|
||||||
old_to_new_root = dict()
|
old_to_new_root = dict()
|
||||||
if HEAD in attrs:
|
if HEAD in attrs:
|
||||||
head_col = attrs.index(HEAD)
|
head_col = attrs.index(HEAD)
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
# if the HEAD refers to a token outside this span, find a more appropriate ancestor
|
# if the HEAD refers to a token outside this span, find a more appropriate ancestor
|
||||||
token = self[i]
|
token = self[i]
|
||||||
ancestor_i = token.head.i - self.c.start # span offset
|
ancestor_i = token.head.i - span_c.start # span offset
|
||||||
if ancestor_i not in range(length):
|
if ancestor_i not in range(length):
|
||||||
if DEP in attrs:
|
if DEP in attrs:
|
||||||
array[i, attrs.index(DEP)] = dep
|
array[i, attrs.index(DEP)] = dep
|
||||||
|
@ -297,7 +307,7 @@ cdef class Span:
|
||||||
# try finding an ancestor within this span
|
# try finding an ancestor within this span
|
||||||
ancestors = token.ancestors
|
ancestors = token.ancestors
|
||||||
for ancestor in ancestors:
|
for ancestor in ancestors:
|
||||||
ancestor_i = ancestor.i - self.c.start
|
ancestor_i = ancestor.i - span_c.start
|
||||||
if ancestor_i in range(length):
|
if ancestor_i in range(length):
|
||||||
array[i, head_col] = ancestor_i - i
|
array[i, head_col] = ancestor_i - i
|
||||||
|
|
||||||
|
@ -326,7 +336,8 @@ cdef class Span:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/span#get_lca_matrix
|
DOCS: https://spacy.io/api/span#get_lca_matrix
|
||||||
"""
|
"""
|
||||||
return numpy.asarray(_get_lca_matrix(self.doc, self.c.start, self.c.end))
|
cdef SpanC* span_c = self.span_c()
|
||||||
|
return numpy.asarray(_get_lca_matrix(self.doc, span_c.start, span_c.end))
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||||
|
@ -422,6 +433,9 @@ cdef class Span:
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E030)
|
raise ValueError(Errors.E030)
|
||||||
|
|
||||||
|
cdef SpanC* span_c(self):
|
||||||
|
return self.c.get()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def sents(self):
|
def sents(self):
|
||||||
"""Obtain the sentences that contain this span. If the given span
|
"""Obtain the sentences that contain this span. If the given span
|
||||||
|
@ -473,10 +487,13 @@ cdef class Span:
|
||||||
DOCS: https://spacy.io/api/span#ents
|
DOCS: https://spacy.io/api/span#ents
|
||||||
"""
|
"""
|
||||||
cdef Span ent
|
cdef Span ent
|
||||||
|
cdef SpanC* span_c = self.span_c()
|
||||||
|
cdef SpanC* ent_span_c
|
||||||
ents = []
|
ents = []
|
||||||
for ent in self.doc.ents:
|
for ent in self.doc.ents:
|
||||||
if ent.c.start >= self.c.start:
|
ent_span_c = ent.span_c()
|
||||||
if ent.c.end <= self.c.end:
|
if ent_span_c.start >= span_c.start:
|
||||||
|
if ent_span_c.end <= span_c.end:
|
||||||
ents.append(ent)
|
ents.append(ent)
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
|
@ -611,11 +628,12 @@ cdef class Span:
|
||||||
# This should probably be called 'head', and the other one called
|
# This should probably be called 'head', and the other one called
|
||||||
# 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
|
# 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
|
||||||
cdef int i
|
cdef int i
|
||||||
|
cdef SpanC* span_c = self.span_c()
|
||||||
# First, we scan through the Span, and check whether there's a word
|
# First, we scan through the Span, and check whether there's a word
|
||||||
# with head==0, i.e. a sentence root. If so, we can return it. The
|
# with head==0, i.e. a sentence root. If so, we can return it. The
|
||||||
# longer the span, the more likely it contains a sentence root, and
|
# longer the span, the more likely it contains a sentence root, and
|
||||||
# in this case we return in linear time.
|
# in this case we return in linear time.
|
||||||
for i in range(self.c.start, self.c.end):
|
for i in range(span_c.start, span_c.end):
|
||||||
if self.doc.c[i].head == 0:
|
if self.doc.c[i].head == 0:
|
||||||
return self.doc[i]
|
return self.doc[i]
|
||||||
# If we don't have a sentence root, we do something that's not so
|
# If we don't have a sentence root, we do something that's not so
|
||||||
|
@ -626,15 +644,15 @@ cdef class Span:
|
||||||
# think this should be okay.
|
# think this should be okay.
|
||||||
cdef int current_best = self.doc.length
|
cdef int current_best = self.doc.length
|
||||||
cdef int root = -1
|
cdef int root = -1
|
||||||
for i in range(self.c.start, self.c.end):
|
for i in range(span_c.start, span_c.end):
|
||||||
if self.c.start <= (i+self.doc.c[i].head) < self.c.end:
|
if span_c.start <= (i+self.doc.c[i].head) < span_c.end:
|
||||||
continue
|
continue
|
||||||
words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
|
words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
|
||||||
if words_to_root < current_best:
|
if words_to_root < current_best:
|
||||||
current_best = words_to_root
|
current_best = words_to_root
|
||||||
root = i
|
root = i
|
||||||
if root == -1:
|
if root == -1:
|
||||||
return self.doc[self.c.start]
|
return self.doc[span_c.start]
|
||||||
else:
|
else:
|
||||||
return self.doc[root]
|
return self.doc[root]
|
||||||
|
|
||||||
|
@ -650,8 +668,9 @@ cdef class Span:
|
||||||
the span.
|
the span.
|
||||||
RETURNS (Span): The newly constructed object.
|
RETURNS (Span): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
start_idx += self.c.start_char
|
cdef SpanC* span_c = self.span_c()
|
||||||
end_idx += self.c.start_char
|
start_idx += span_c.start_char
|
||||||
|
end_idx += span_c.start_char
|
||||||
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
|
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -732,60 +751,62 @@ cdef class Span:
|
||||||
|
|
||||||
property start:
|
property start:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.start
|
return self.span_c().start
|
||||||
|
|
||||||
def __set__(self, int start):
|
def __set__(self, int start):
|
||||||
if start < 0:
|
if start < 0:
|
||||||
raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
|
raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
|
||||||
self.c.start = start
|
self.span_c().start = start
|
||||||
|
|
||||||
property end:
|
property end:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.end
|
return self.span_c().end
|
||||||
|
|
||||||
def __set__(self, int end):
|
def __set__(self, int end):
|
||||||
if end < 0:
|
if end < 0:
|
||||||
raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
|
raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
|
||||||
self.c.end = end
|
self.span_c().end = end
|
||||||
|
|
||||||
property start_char:
|
property start_char:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.start_char
|
return self.span_c().start_char
|
||||||
|
|
||||||
def __set__(self, int start_char):
|
def __set__(self, int start_char):
|
||||||
if start_char < 0:
|
if start_char < 0:
|
||||||
raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
|
raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
|
||||||
self.c.start_char = start_char
|
self.span_c().start_char = start_char
|
||||||
|
|
||||||
property end_char:
|
property end_char:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.end_char
|
return self.span_c().end_char
|
||||||
|
|
||||||
def __set__(self, int end_char):
|
def __set__(self, int end_char):
|
||||||
if end_char < 0:
|
if end_char < 0:
|
||||||
raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
|
raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
|
||||||
self.c.end_char = end_char
|
self.span_c().end_char = end_char
|
||||||
|
|
||||||
property label:
|
property label:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.label
|
return self.span_c().label
|
||||||
|
|
||||||
def __set__(self, attr_t label):
|
def __set__(self, attr_t label):
|
||||||
self.c.label = label
|
self.span_c().label = label
|
||||||
|
|
||||||
property kb_id:
|
property kb_id:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.kb_id
|
return self.span_c().kb_id
|
||||||
|
|
||||||
def __set__(self, attr_t kb_id):
|
def __set__(self, attr_t kb_id):
|
||||||
self.c.kb_id = kb_id
|
self.span_c().kb_id = kb_id
|
||||||
|
|
||||||
property id:
|
property id:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.id
|
cdef SpanC* span_c = self.span_c()
|
||||||
|
return span_c.id
|
||||||
|
|
||||||
def __set__(self, attr_t id):
|
def __set__(self, attr_t id):
|
||||||
self.c.id = id
|
cdef SpanC* span_c = self.span_c()
|
||||||
|
span_c.id = id
|
||||||
|
|
||||||
property ent_id:
|
property ent_id:
|
||||||
"""RETURNS (uint64): The entity ID."""
|
"""RETURNS (uint64): The entity ID."""
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from libcpp.memory cimport shared_ptr
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from ..structs cimport SpanC
|
from ..structs cimport SpanC
|
||||||
|
|
||||||
|
@ -5,6 +6,6 @@ cdef class SpanGroup:
|
||||||
cdef public object _doc_ref
|
cdef public object _doc_ref
|
||||||
cdef public str name
|
cdef public str name
|
||||||
cdef public dict attrs
|
cdef public dict attrs
|
||||||
cdef vector[SpanC] c
|
cdef vector[shared_ptr[SpanC]] c
|
||||||
|
|
||||||
cdef void push_back(self, SpanC span) nogil
|
cdef void push_back(self, const shared_ptr[SpanC] &span)
|
||||||
|
|
|
@ -6,6 +6,7 @@ import srsly
|
||||||
|
|
||||||
from spacy.errors import Errors
|
from spacy.errors import Errors
|
||||||
from .span cimport Span
|
from .span cimport Span
|
||||||
|
from libcpp.memory cimport make_shared
|
||||||
|
|
||||||
|
|
||||||
cdef class SpanGroup:
|
cdef class SpanGroup:
|
||||||
|
@ -187,10 +188,12 @@ cdef class SpanGroup:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/spangroup#to_bytes
|
DOCS: https://spacy.io/api/spangroup#to_bytes
|
||||||
"""
|
"""
|
||||||
|
cdef SpanC* span_c
|
||||||
output = {"name": self.name, "attrs": self.attrs, "spans": []}
|
output = {"name": self.name, "attrs": self.attrs, "spans": []}
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.c.size()):
|
for i in range(self.c.size()):
|
||||||
span = self.c[i]
|
span = self.c[i]
|
||||||
|
span_c = span.get()
|
||||||
# The struct.pack here is probably overkill, but it might help if
|
# The struct.pack here is probably overkill, but it might help if
|
||||||
# you're saving tonnes of spans, and it doesn't really add any
|
# you're saving tonnes of spans, and it doesn't really add any
|
||||||
# complexity. We do take care to specify little-endian byte order
|
# complexity. We do take care to specify little-endian byte order
|
||||||
|
@ -202,13 +205,13 @@ cdef class SpanGroup:
|
||||||
# l: int32_t
|
# l: int32_t
|
||||||
output["spans"].append(struct.pack(
|
output["spans"].append(struct.pack(
|
||||||
">QQQllll",
|
">QQQllll",
|
||||||
span.id,
|
span_c.id,
|
||||||
span.kb_id,
|
span_c.kb_id,
|
||||||
span.label,
|
span_c.label,
|
||||||
span.start,
|
span_c.start,
|
||||||
span.end,
|
span_c.end,
|
||||||
span.start_char,
|
span_c.start_char,
|
||||||
span.end_char
|
span_c.end_char
|
||||||
))
|
))
|
||||||
return srsly.msgpack_dumps(output)
|
return srsly.msgpack_dumps(output)
|
||||||
|
|
||||||
|
@ -235,10 +238,10 @@ cdef class SpanGroup:
|
||||||
span.end = items[4]
|
span.end = items[4]
|
||||||
span.start_char = items[5]
|
span.start_char = items[5]
|
||||||
span.end_char = items[6]
|
span.end_char = items[6]
|
||||||
self.c.push_back(span)
|
self.c.push_back(make_shared[SpanC](span))
|
||||||
return self
|
return self
|
||||||
|
|
||||||
cdef void push_back(self, SpanC span) nogil:
|
cdef void push_back(self, const shared_ptr[SpanC] &span):
|
||||||
self.c.push_back(span)
|
self.c.push_back(span)
|
||||||
|
|
||||||
def copy(self) -> SpanGroup:
|
def copy(self) -> SpanGroup:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user