mirror of https://github.com/explosion/spaCy.git
synced 2024-11-11 12:18:04 +03:00

Merge branch 'master' into spacy.io
Commit e4820fa667
azure-pipelines.yml

@@ -9,6 +9,11 @@ trigger:
     exclude:
     - 'website/*'
     - '*.md'
+pr:
+  paths:
+    exclude:
+    - 'website/*'
+    - '*.md'
 
 jobs:
spacy/about.py

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.2.1"
+__version__ = "2.2.2.dev1"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
spacy/gold.pyx | 246

@@ -546,7 +546,7 @@ cdef class GoldParse:
     def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None,
                  heads=None, deps=None, entities=None, make_projective=False,
                  cats=None, links=None, **_):
-        """Create a GoldParse.
+        """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
 
         doc (Doc): The document the annotations refer to.
         words (iterable): A sequence of unicode word strings.
@@ -575,138 +575,142 @@ cdef class GoldParse:
         negative examples respectively.
         RETURNS (GoldParse): The newly constructed object.
         """
-        if words is None:
-            words = [token.text for token in doc]
-        if tags is None:
-            tags = [None for _ in words]
-        if heads is None:
-            heads = [None for _ in words]
-        if deps is None:
-            deps = [None for _ in words]
-        if morphology is None:
-            morphology = [None for _ in words]
-        if entities is None:
-            entities = ["-" for _ in doc]
-        elif len(entities) == 0:
-            entities = ["O" for _ in doc]
-        else:
-            # Translate the None values to '-', to make processing easier.
-            # See Issue #2603
-            entities = [(ent if ent is not None else "-") for ent in entities]
-            if not isinstance(entities[0], basestring):
-                # Assume we have entities specified by character offset.
-                entities = biluo_tags_from_offsets(doc, entities)
         self.mem = Pool()
         self.loss = 0
         self.length = len(doc)
 
-        # These are filled by the tagger/parser/entity recogniser
-        self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
-        self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
-        self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
-        self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
-        self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
-        self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
-
         self.cats = {} if cats is None else dict(cats)
         self.links = links
-        self.words = [None] * len(doc)
-        self.tags = [None] * len(doc)
-        self.heads = [None] * len(doc)
-        self.labels = [None] * len(doc)
-        self.ner = [None] * len(doc)
-        self.morphology = [None] * len(doc)
-
-        # This needs to be done before we align the words
-        if make_projective and heads is not None and deps is not None:
-            heads, deps = nonproj.projectivize(heads, deps)
-
-        # Do many-to-one alignment for misaligned tokens.
-        # If we over-segment, we'll have one gold word that covers a sequence
-        # of predicted words
-        # If we under-segment, we'll have one predicted word that covers a
-        # sequence of gold words.
-        # If we "mis-segment", we'll have a sequence of predicted words covering
-        # a sequence of gold words. That's many-to-many -- we don't do that.
-        cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
-
-        self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
-        self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
-
-        annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
-        self.orig_annot = list(zip(*annot_tuples))
-
-        for i, gold_i in enumerate(self.cand_to_gold):
-            if doc[i].text.isspace():
-                self.words[i] = doc[i].text
-                self.tags[i] = "_SP"
-                self.heads[i] = None
-                self.labels[i] = None
-                self.ner[i] = None
-                self.morphology[i] = set()
-            if gold_i is None:
-                if i in i2j_multi:
-                    self.words[i] = words[i2j_multi[i]]
-                    self.tags[i] = tags[i2j_multi[i]]
-                    self.morphology[i] = morphology[i2j_multi[i]]
-                    is_last = i2j_multi[i] != i2j_multi.get(i+1)
-                    is_first = i2j_multi[i] != i2j_multi.get(i-1)
-                    # Set next word in multi-token span as head, until last
-                    if not is_last:
-                        self.heads[i] = i+1
-                        self.labels[i] = "subtok"
-                    else:
-                        self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
-                        self.labels[i] = deps[i2j_multi[i]]
-                    # Now set NER...This is annoying because if we've split
-                    # got an entity word split into two, we need to adjust the
-                    # BILUO tags. We can't have BB or LL etc.
-                    # Case 1: O -- easy.
-                    ner_tag = entities[i2j_multi[i]]
-                    if ner_tag == "O":
-                        self.ner[i] = "O"
-                    # Case 2: U. This has to become a B I* L sequence.
-                    elif ner_tag.startswith("U-"):
-                        if is_first:
-                            self.ner[i] = ner_tag.replace("U-", "B-", 1)
-                        elif is_last:
-                            self.ner[i] = ner_tag.replace("U-", "L-", 1)
-                        else:
-                            self.ner[i] = ner_tag.replace("U-", "I-", 1)
-                    # Case 3: L. If not last, change to I.
-                    elif ner_tag.startswith("L-"):
-                        if is_last:
-                            self.ner[i] = ner_tag
-                        else:
-                            self.ner[i] = ner_tag.replace("L-", "I-", 1)
-                    # Case 4: I. Stays correct
-                    elif ner_tag.startswith("I-"):
-                        self.ner[i] = ner_tag
-            else:
-                self.words[i] = words[gold_i]
-                self.tags[i] = tags[gold_i]
-                self.morphology[i] = morphology[gold_i]
-                if heads[gold_i] is None:
-                    self.heads[i] = None
-                else:
-                    self.heads[i] = self.gold_to_cand[heads[gold_i]]
-                self.labels[i] = deps[gold_i]
-                self.ner[i] = entities[gold_i]
-
-        # Prevent whitespace that isn't within entities from being tagged as
-        # an entity.
-        for i in range(len(self.ner)):
-            if self.tags[i] == "_SP":
-                prev_ner = self.ner[i-1] if i >= 1 else None
-                next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
-                if prev_ner == "O" or next_ner == "O":
-                    self.ner[i] = "O"
-
-        cycle = nonproj.contains_cycle(self.heads)
-        if cycle is not None:
-            raise ValueError(Errors.E069.format(cycle=cycle,
-                cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
-                doc_tokens=" ".join(words[:50])))
+
+        # avoid allocating memory if the doc does not contain any tokens
+        if self.length > 0:
+            if words is None:
+                words = [token.text for token in doc]
+            if tags is None:
+                tags = [None for _ in words]
+            if heads is None:
+                heads = [None for _ in words]
+            if deps is None:
+                deps = [None for _ in words]
+            if morphology is None:
+                morphology = [None for _ in words]
+            if entities is None:
+                entities = ["-" for _ in doc]
+            elif len(entities) == 0:
+                entities = ["O" for _ in doc]
+            else:
+                # Translate the None values to '-', to make processing easier.
+                # See Issue #2603
+                entities = [(ent if ent is not None else "-") for ent in entities]
+                if not isinstance(entities[0], basestring):
+                    # Assume we have entities specified by character offset.
+                    entities = biluo_tags_from_offsets(doc, entities)
+
+            # These are filled by the tagger/parser/entity recogniser
+            self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
+            self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
+            self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
+            self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
+            self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
+            self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
+
+            self.words = [None] * len(doc)
+            self.tags = [None] * len(doc)
+            self.heads = [None] * len(doc)
+            self.labels = [None] * len(doc)
+            self.ner = [None] * len(doc)
+            self.morphology = [None] * len(doc)
+
+            # This needs to be done before we align the words
+            if make_projective and heads is not None and deps is not None:
+                heads, deps = nonproj.projectivize(heads, deps)
+
+            # Do many-to-one alignment for misaligned tokens.
+            # If we over-segment, we'll have one gold word that covers a sequence
+            # of predicted words
+            # If we under-segment, we'll have one predicted word that covers a
+            # sequence of gold words.
+            # If we "mis-segment", we'll have a sequence of predicted words covering
+            # a sequence of gold words. That's many-to-many -- we don't do that.
+            cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
+
+            self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
+            self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
+
+            annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
+            self.orig_annot = list(zip(*annot_tuples))
+
+            for i, gold_i in enumerate(self.cand_to_gold):
+                if doc[i].text.isspace():
+                    self.words[i] = doc[i].text
+                    self.tags[i] = "_SP"
+                    self.heads[i] = None
+                    self.labels[i] = None
+                    self.ner[i] = None
+                    self.morphology[i] = set()
+                if gold_i is None:
+                    if i in i2j_multi:
+                        self.words[i] = words[i2j_multi[i]]
+                        self.tags[i] = tags[i2j_multi[i]]
+                        self.morphology[i] = morphology[i2j_multi[i]]
+                        is_last = i2j_multi[i] != i2j_multi.get(i+1)
+                        is_first = i2j_multi[i] != i2j_multi.get(i-1)
+                        # Set next word in multi-token span as head, until last
+                        if not is_last:
+                            self.heads[i] = i+1
+                            self.labels[i] = "subtok"
+                        else:
+                            self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
+                            self.labels[i] = deps[i2j_multi[i]]
+                        # Now set NER...This is annoying because if we've split
+                        # got an entity word split into two, we need to adjust the
+                        # BILUO tags. We can't have BB or LL etc.
+                        # Case 1: O -- easy.
+                        ner_tag = entities[i2j_multi[i]]
+                        if ner_tag == "O":
+                            self.ner[i] = "O"
+                        # Case 2: U. This has to become a B I* L sequence.
+                        elif ner_tag.startswith("U-"):
+                            if is_first:
+                                self.ner[i] = ner_tag.replace("U-", "B-", 1)
+                            elif is_last:
+                                self.ner[i] = ner_tag.replace("U-", "L-", 1)
+                            else:
+                                self.ner[i] = ner_tag.replace("U-", "I-", 1)
+                        # Case 3: L. If not last, change to I.
+                        elif ner_tag.startswith("L-"):
+                            if is_last:
+                                self.ner[i] = ner_tag
+                            else:
+                                self.ner[i] = ner_tag.replace("L-", "I-", 1)
+                        # Case 4: I. Stays correct
+                        elif ner_tag.startswith("I-"):
+                            self.ner[i] = ner_tag
+                else:
+                    self.words[i] = words[gold_i]
+                    self.tags[i] = tags[gold_i]
+                    self.morphology[i] = morphology[gold_i]
+                    if heads[gold_i] is None:
+                        self.heads[i] = None
+                    else:
+                        self.heads[i] = self.gold_to_cand[heads[gold_i]]
+                    self.labels[i] = deps[gold_i]
+                    self.ner[i] = entities[gold_i]
+
+            # Prevent whitespace that isn't within entities from being tagged as
+            # an entity.
+            for i in range(len(self.ner)):
+                if self.tags[i] == "_SP":
+                    prev_ner = self.ner[i-1] if i >= 1 else None
+                    next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
+                    if prev_ner == "O" or next_ner == "O":
+                        self.ner[i] = "O"
+
+            cycle = nonproj.contains_cycle(self.heads)
+            if cycle is not None:
+                raise ValueError(Errors.E069.format(cycle=cycle,
+                    cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
+                    doc_tokens=" ".join(words[:50])))
 
     def __len__(self):
         """Get the number of gold-standard tokens.
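Note: a minimal sketch of the behaviour this change introduces, assuming the spaCy v2.2.x API (spacy.blank and spacy.gold.GoldParse are the real v2 names); constructing a GoldParse from a zero-length Doc now returns early instead of allocating the C buffers and running alignment:

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.blank("en")
    empty = GoldParse(nlp(""))    # len(doc) == 0: fields stay uninitialized
    assert len(empty) == 0
    filled = GoldParse(nlp("hello world"))    # non-empty docs behave as before
    assert len(filled) == 2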
spacy/matcher/matcher.pyx

@@ -254,7 +254,12 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
     cdef PatternStateC state
     cdef int i, j, nr_extra_attr
     cdef Pool mem = Pool()
-    predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char))
+    output = []
+    if doc.length == 0:
+        # avoid any processing or mem alloc if the document is empty
+        return output
+    if len(predicates) > 0:
+        predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char))
     if extensions is not None and len(extensions) >= 1:
         nr_extra_attr = max(extensions.values()) + 1
         extra_attr_values = <attr_t*>mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t))
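Note: the effect of the new early return, sketched against the v2 Matcher API; an empty Doc now yields an empty match list without touching the allocator:

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    matcher.add("HELLO", None, [{"LOWER": "hello"}])    # v2-style add(key, on_match, pattern)
    assert matcher(nlp("")) == []    # empty doc: no processing, no mem alloc
    assert len(matcher(nlp("hello there"))) == 1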
@@ -278,7 +283,6 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
         predicate_cache += len(predicates)
     # Handle matches that end in 0-width patterns
     finish_states(matches, states)
-    output = []
     seen = set()
     for i in range(matches.size()):
         match = (
@@ -560,12 +564,14 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
         for j, (attr, value) in enumerate(spec):
             pattern[i].attrs[j].attr = attr
             pattern[i].attrs[j].value = value
-        pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC))
+        if len(extensions) > 0:
+            pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC))
         for j, (index, value) in enumerate(extensions):
             pattern[i].extra_attrs[j].index = index
             pattern[i].extra_attrs[j].value = value
         pattern[i].nr_extra_attr = len(extensions)
-        pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t))
+        if len(predicates) > 0:
+            pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t))
         for j, index in enumerate(predicates):
             pattern[i].py_predicates[j] = index
         pattern[i].nr_py = len(predicates)
spacy/syntax/_parser_model.pxd

@@ -36,7 +36,9 @@ cdef WeightsC get_c_weights(model) except *
 
 cdef SizesC get_c_sizes(model, int batch_size) except *
 
-cdef void resize_activations(ActivationsC* A, SizesC n) nogil
+cdef ActivationsC alloc_activations(SizesC n) nogil
+
+cdef void free_activations(const ActivationsC* A) nogil
 
 cdef void predict_states(ActivationsC* A, StateC** states,
         const WeightsC* W, SizesC n) nogil
spacy/syntax/_parser_model.pyx

@@ -62,6 +62,21 @@ cdef SizesC get_c_sizes(model, int batch_size) except *:
     return output
 
 
+cdef ActivationsC alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    resize_activations(&A, n)
+    return A
+
+
+cdef void free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.scores)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
 cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
     if n.states <= A._max_size:
         A._curr_size = n.states
spacy/syntax/nn_parser.pyx

@@ -27,7 +27,8 @@ from thinc.neural.util import get_array_module
 from thinc.linalg cimport Vec, VecVec
 import srsly
 
-from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid
+from ._parser_model cimport alloc_activations, free_activations
+from ._parser_model cimport predict_states, arg_max_if_valid
 from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
 from ._parser_model cimport get_c_weights, get_c_sizes
 from ._parser_model import ParserModel
@@ -312,8 +313,7 @@ cdef class Parser:
             WeightsC weights, SizesC sizes) nogil:
         cdef int i, j
         cdef vector[StateC*] unfinished
-        cdef ActivationsC activations
-        memset(&activations, 0, sizeof(activations))
+        cdef ActivationsC activations = alloc_activations(sizes)
         while sizes.states >= 1:
             predict_states(&activations,
                 states, &weights, sizes)
@@ -327,6 +327,7 @@ cdef class Parser:
                 states[i] = unfinished[i]
             sizes.states = unfinished.size()
             unfinished.clear()
+        free_activations(&activations)
 
     def set_annotations(self, docs, states_or_beams, tensors=None):
         cdef StateClass state
@@ -363,6 +364,9 @@ cdef class Parser:
 
     cdef void c_transition_batch(self, StateC** states, const float* scores,
             int nr_class, int batch_size) nogil:
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        with gil:
+            assert self.moves.n_moves > 0
         is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
         cdef int i, guess
         cdef Transition action
@@ -546,6 +550,10 @@ cdef class Parser:
         cdef GoldParse gold
         cdef Pool mem = Pool()
         cdef int i
+
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        assert self.moves.n_moves > 0
+
         is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
         costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
         cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
spacy/syntax/transition_system.pyx

@@ -83,6 +83,8 @@ cdef class TransitionSystem:
 
     def get_oracle_sequence(self, doc, GoldParse gold):
         cdef Pool mem = Pool()
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        assert self.n_moves > 0
         costs = <float*>mem.alloc(self.n_moves, sizeof(float))
         is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
 
spacy/tests/vocab_vectors/test_vectors.py

@@ -141,6 +141,18 @@ def test_vectors_most_similar(most_similar_vectors_data):
     assert all(row[0] == i for i, row in enumerate(best_rows))
 
 
+def test_vectors_most_similar_identical():
+    """Test that most similar identical vectors are assigned a score of 1.0."""
+    data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
+    v = Vectors(data=data, keys=["A", "B", "C"])
+    keys, _, scores = v.most_similar(numpy.asarray([[4, 2, 2, 2]], dtype="f"))
+    assert scores[0][0] == 1.0  # not 1.0000002
+    data = numpy.asarray([[1, 2, 3], [1, 2, 3], [1, 1, 1]], dtype="f")
+    v = Vectors(data=data, keys=["A", "B", "C"])
+    keys, _, scores = v.most_similar(numpy.asarray([[1, 2, 3]], dtype="f"))
+    assert scores[0][0] == 1.0  # not 0.9999999
+
+
 @pytest.mark.parametrize("text", ["apple and orange"])
 def test_vectors_token_vector(tokenizer_v, vectors, text):
     doc = tokenizer_v(text)
@@ -302,4 +314,4 @@ def test_vocab_prune_vectors():
     assert list(remap.keys()) == ["kitten"]
     neighbour, similarity = list(remap.values())[0]
     assert neighbour == "cat", remap
-    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
+    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
spacy/tokenizer.pyx

@@ -331,6 +331,9 @@ cdef class Tokenizer:
     cdef int _save_cached(self, const TokenC* tokens, hash_t key,
                           int has_special, int n) except -1:
         cdef int i
+        if n <= 0:
+            # avoid mem alloc of zero length
+            return 0
         for i in range(n):
             if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
                 return 0
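Note: a hedged minimal repro of the degenerate input the new guard covers (the guard fires for any cache entry that would hold zero tokens):

    import spacy

    nlp = spacy.blank("en")
    doc = nlp.tokenizer("")    # n == 0: _save_cached now returns immediately
    assert len(doc) == 0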
spacy/tokens/_retokenize.pyx

@@ -157,6 +157,9 @@ def _merge(Doc doc, merges):
     cdef TokenC* token
     cdef Pool mem = Pool()
     cdef int merged_iob = 0
+
+    # merges should not be empty, but make sure to avoid zero-length mem alloc
+    assert len(merges) > 0
     tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC))
     spans = []
 
spacy/tokens/doc.pyx

@@ -791,6 +791,8 @@ cdef class Doc:
         # Get set up for fast loading
         cdef Pool mem = Pool()
         cdef int n_attrs = len(attrs)
+        # attrs should not be empty, but make sure to avoid zero-length mem alloc
+        assert n_attrs > 0
         attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
         for i, attr_id in enumerate(attrs):
             attr_ids[i] = attr_id
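Note: the assert protects the fast-load path in Doc.from_array, which expects a non-empty attrs list; a sketch of the usual v2 round trip (the attribute choice is illustrative):

    import spacy
    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc = nlp("Hello world")
    header = [LOWER, POS, ENT_TYPE, IS_ALPHA]   # must be non-empty (now asserted)
    arr = doc.to_array(header)
    doc2 = Doc(doc.vocab, words=[t.text for t in doc])
    doc2.from_array(header, arr)                # fast loading with n_attrs > 0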
spacy/vectors.pyx

@@ -344,8 +344,12 @@ cdef class Vectors:
             sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
             scores[i:i+batch_size] = scores[sorted_index]
             best_rows[i:i+batch_size] = best_rows[sorted_index]
 
         xp = get_array_module(self.data)
+        # Round values really close to 1 or -1
+        scores = xp.around(scores, decimals=4, out=scores)
+        # Account for numerical error we want to return in range -1, 1
+        scores = xp.clip(scores, a_min=-1, a_max=1, out=scores)
         row2key = {row: key for key, row in self.key2row.items()}
         keys = xp.asarray(
             [[row2key[row] for row in best_rows[i] if row in row2key]
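Note: rounding and clipping are combined because clipping alone would not lift a score like 0.9999999 up to 1.0; a numpy-only sketch of the arithmetic (same around/clip calls as above):

    import numpy

    scores = numpy.asarray([1.0000002, 0.9999999, -1.0000001], dtype="f")
    scores = numpy.around(scores, decimals=4, out=scores)       # 0.9999999 -> 1.0
    scores = numpy.clip(scores, a_min=-1, a_max=1, out=scores)  # stay within [-1, 1]
    print(scores)  # [ 1.  1. -1.]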
website/docs (expand_person_entities usage example)

@@ -1135,6 +1135,8 @@ def expand_person_entities(doc):
             if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
                 new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                 new_ents.append(new_ent)
+            else:
+                new_ents.append(ent)
         else:
             new_ents.append(ent)
     doc.ents = new_ents
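Note: the added else keeps PERSON entities whose preceding token is not a title; a hedged sketch, assuming the expand_person_entities component from this docs example is defined in scope and the en_core_web_sm model is installed:

    import spacy

    nlp = spacy.load("en_core_web_sm")           # assumes the small English model
    nlp.add_pipe(expand_person_entities, after="ner")
    doc = nlp("Dr. Alex Smith met Alice Jones.")
    # Before the fix, a PERSON entity without a preceding title ("Alice Jones",
    # if the model predicts it) was dropped from doc.ents; now it is kept.
    print([(ent.text, ent.label_) for ent in doc.ents])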