mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
bdc23dd8c1
|
@ -245,6 +245,8 @@ class Errors(object):
|
||||||
"the meta.json. Vector names are required to avoid issue #1660.")
|
"the meta.json. Vector names are required to avoid issue #1660.")
|
||||||
E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}")
|
E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}")
|
||||||
E094 = ("Error reading line {line_num} in vectors file {loc}.")
|
E094 = ("Error reading line {line_num} in vectors file {loc}.")
|
||||||
|
E095 = ("Can't write to frozen dictionary. This is likely an internal "
|
||||||
|
"error. Are you writing to a default function argument?")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -19,12 +19,10 @@ cdef struct WeightsC:
|
||||||
const float* feat_bias
|
const float* feat_bias
|
||||||
const float* hidden_bias
|
const float* hidden_bias
|
||||||
const float* hidden_weights
|
const float* hidden_weights
|
||||||
const float* vectors
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct ActivationsC:
|
cdef struct ActivationsC:
|
||||||
int* token_ids
|
int* token_ids
|
||||||
float* vectors
|
|
||||||
float* unmaxed
|
float* unmaxed
|
||||||
float* scores
|
float* scores
|
||||||
float* hiddens
|
float* hiddens
|
||||||
|
|
|
@ -50,8 +50,6 @@ cdef WeightsC get_c_weights(model) except *:
|
||||||
cdef np.ndarray vec2scores_b = model.vec2scores.b
|
cdef np.ndarray vec2scores_b = model.vec2scores.b
|
||||||
output.hidden_weights = <const float*>vec2scores_W.data
|
output.hidden_weights = <const float*>vec2scores_W.data
|
||||||
output.hidden_bias = <const float*>vec2scores_b.data
|
output.hidden_bias = <const float*>vec2scores_b.data
|
||||||
cdef np.ndarray tokvecs = model.tokvecs
|
|
||||||
output.vectors = <float*>tokvecs.data
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
@ -72,7 +70,6 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
|
||||||
return
|
return
|
||||||
if A._max_size == 0:
|
if A._max_size == 0:
|
||||||
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
|
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
|
||||||
A.vectors = <float*>calloc(n.states * n.embed_width, sizeof(A.vectors[0]))
|
|
||||||
A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
|
A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
|
||||||
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
|
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
|
||||||
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
|
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
|
||||||
|
@ -81,8 +78,6 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
|
||||||
else:
|
else:
|
||||||
A.token_ids = <int*>realloc(A.token_ids,
|
A.token_ids = <int*>realloc(A.token_ids,
|
||||||
n.states * n.feats * sizeof(A.token_ids[0]))
|
n.states * n.feats * sizeof(A.token_ids[0]))
|
||||||
A.vectors = <float*>realloc(A.vectors,
|
|
||||||
n.states * n.embed_width * sizeof(A.vectors[0]))
|
|
||||||
A.scores = <float*>realloc(A.scores,
|
A.scores = <float*>realloc(A.scores,
|
||||||
n.states * n.classes * sizeof(A.scores[0]))
|
n.states * n.classes * sizeof(A.scores[0]))
|
||||||
A.unmaxed = <float*>realloc(A.unmaxed,
|
A.unmaxed = <float*>realloc(A.unmaxed,
|
||||||
|
@ -242,7 +237,7 @@ class ParserStepModel(Model):
|
||||||
def begin_update(self, states, drop=0.):
|
def begin_update(self, states, drop=0.):
|
||||||
token_ids = self.get_token_ids(states)
|
token_ids = self.get_token_ids(states)
|
||||||
vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
|
vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
|
||||||
mask = self.ops.get_dropout_mask(vector.shape, drop)
|
mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop)
|
||||||
if mask is not None:
|
if mask is not None:
|
||||||
vector *= mask
|
vector *= mask
|
||||||
scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
|
scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
|
||||||
|
@ -251,7 +246,7 @@ class ParserStepModel(Model):
|
||||||
d_vector = get_d_vector(d_scores, sgd=sgd)
|
d_vector = get_d_vector(d_scores, sgd=sgd)
|
||||||
if mask is not None:
|
if mask is not None:
|
||||||
d_vector *= mask
|
d_vector *= mask
|
||||||
if isinstance(self.ops, CupyOps) \
|
if isinstance(self.state2vec.ops, CupyOps) \
|
||||||
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
|
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
|
||||||
# Move token_ids and d_vector to GPU, asynchronously
|
# Move token_ids and d_vector to GPU, asynchronously
|
||||||
self.backprops.append((
|
self.backprops.append((
|
||||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...vocab import Vocab
|
from ...vocab import Vocab
|
||||||
|
from ...attrs import LEMMA
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -178,6 +179,26 @@ def test_doc_api_merge_hang(en_tokenizer):
|
||||||
doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
|
doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_api_retokenizer(en_tokenizer):
    """Merge tokens 4..6 through the retokenize() context manager and check
    that the doc ends up with seven tokens, the merged one at index 4.
    """
    text = "WKRO played songs by the beach boys all night"
    doc = en_tokenizer(text)
    with doc.retokenize() as retokenizer:
        span = doc[4:7]
        retokenizer.merge(span)
    # Check the token count first, then the text of the merged token.
    assert len(doc) == 7
    merged = doc[4]
    assert merged.text == 'the beach boys'
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_api_retokenizer_attrs(en_tokenizer):
    """Merge tokens 4..6 with explicit attrs and check that the lemma and
    entity type passed in show up on the merged token.
    """
    text = "WKRO played songs by the beach boys all night"
    doc = en_tokenizer(text)
    # test both string and integer attributes and values
    org_value = doc.vocab.strings['ORG']
    attrs = {LEMMA: 'boys', 'ENT_TYPE': org_value}
    with doc.retokenize() as retokenizer:
        span = doc[4:7]
        retokenizer.merge(span, attrs=attrs)
    assert len(doc) == 7
    merged = doc[4]
    assert merged.text == 'the beach boys'
    assert merged.lemma_ == 'boys'
    assert merged.ent_type_ == 'ORG'
|
||||||
|
|
||||||
|
|
||||||
def test_doc_api_sents_empty_string(en_tokenizer):
|
def test_doc_api_sents_empty_string(en_tokenizer):
|
||||||
doc = en_tokenizer("")
|
doc = en_tokenizer("")
|
||||||
doc.is_parsed = True
|
doc.is_parsed = True
|
||||||
|
|
|
@ -11,11 +11,13 @@ from .span cimport Span
|
||||||
from .token cimport Token
|
from .token cimport Token
|
||||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||||
from ..structs cimport LexemeC, TokenC
|
from ..structs cimport LexemeC, TokenC
|
||||||
from ..attrs cimport *
|
from ..attrs cimport TAG
|
||||||
|
from ..attrs import intify_attrs
|
||||||
|
from ..util import SimpleFrozenDict
|
||||||
|
|
||||||
|
|
||||||
cdef class Retokenizer:
|
cdef class Retokenizer:
|
||||||
'''Helper class for doc.retokenize() context manager.'''
|
"""Helper class for doc.retokenize() context manager."""
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef list merges
|
cdef list merges
|
||||||
cdef list splits
|
cdef list splits
|
||||||
|
@ -24,14 +26,18 @@ cdef class Retokenizer:
|
||||||
self.merges = []
|
self.merges = []
|
||||||
self.splits = []
|
self.splits = []
|
||||||
|
|
||||||
def merge(self, Span span, attrs=None):
|
def merge(self, Span span, attrs=SimpleFrozenDict()):
|
||||||
'''Mark a span for merging. The attrs will be applied to the resulting
|
"""Mark a span for merging. The attrs will be applied to the resulting
|
||||||
token.'''
|
token.
|
||||||
|
"""
|
||||||
|
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
|
||||||
self.merges.append((span.start_char, span.end_char, attrs))
|
self.merges.append((span.start_char, span.end_char, attrs))
|
||||||
|
|
||||||
def split(self, Token token, orths, attrs=None):
|
def split(self, Token token, orths, attrs=SimpleFrozenDict()):
|
||||||
'''Mark a Token for splitting, into the specified orths. The attrs
|
"""Mark a Token for splitting, into the specified orths. The attrs
|
||||||
will be applied to each subtoken.'''
|
will be applied to each subtoken.
|
||||||
|
"""
|
||||||
|
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
|
||||||
self.splits.append((token.start_char, orths, attrs))
|
self.splits.append((token.start_char, orths, attrs))
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
|
@ -125,5 +131,3 @@ def _merge(Doc doc, int start, int end, attributes):
|
||||||
# Clear the cached Python objects
|
# Clear the cached Python objects
|
||||||
# Return the merged Python object
|
# Return the merged Python object
|
||||||
return doc[start]
|
return doc[start]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -635,3 +635,18 @@ def use_gpu(gpu_id):
|
||||||
def fix_random_seed(seed=0):
|
def fix_random_seed(seed=0):
|
||||||
random.seed(seed)
|
random.seed(seed)
|
||||||
numpy.random.seed(seed)
|
numpy.random.seed(seed)
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty
    dictionary). Will raise an error if user or spaCy attempts to add to dict.

    Every mutating dict method is overridden to raise NotImplementedError
    with the E095 message, so a shared default argument can never be
    modified by accident. Items passed to the constructor are kept, and all
    read-only dict operations behave as usual.
    """
    def __setitem__(self, key, value):
        raise NotImplementedError(Errors.E095)

    def __delitem__(self, key):
        raise NotImplementedError(Errors.E095)

    def pop(self, key, default=None):
        raise NotImplementedError(Errors.E095)

    def popitem(self):
        raise NotImplementedError(Errors.E095)

    def clear(self):
        raise NotImplementedError(Errors.E095)

    def setdefault(self, key, default=None):
        # dict.setdefault inserts missing keys without going through
        # __setitem__, so it has to be blocked explicitly.
        raise NotImplementedError(Errors.E095)

    def update(self, *args, **kwargs):
        # Accept every calling convention of dict.update (mapping, iterable
        # of pairs, keyword arguments) so that all of them report E095.
        raise NotImplementedError(Errors.E095)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user