Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2018-05-20 18:59:24 +02:00
commit bdc23dd8c1
6 changed files with 54 additions and 19 deletions

View File

@ -245,6 +245,8 @@ class Errors(object):
"the meta.json. Vector names are required to avoid issue #1660.") "the meta.json. Vector names are required to avoid issue #1660.")
E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}") E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}")
E094 = ("Error reading line {line_num} in vectors file {loc}.") E094 = ("Error reading line {line_num} in vectors file {loc}.")
E095 = ("Can't write to frozen dictionary. This is likely an internal "
"error. Are you writing to a default function argument?")
@add_codes @add_codes

View File

@ -19,12 +19,10 @@ cdef struct WeightsC:
const float* feat_bias const float* feat_bias
const float* hidden_bias const float* hidden_bias
const float* hidden_weights const float* hidden_weights
const float* vectors
cdef struct ActivationsC: cdef struct ActivationsC:
int* token_ids int* token_ids
float* vectors
float* unmaxed float* unmaxed
float* scores float* scores
float* hiddens float* hiddens

View File

@ -50,8 +50,6 @@ cdef WeightsC get_c_weights(model) except *:
cdef np.ndarray vec2scores_b = model.vec2scores.b cdef np.ndarray vec2scores_b = model.vec2scores.b
output.hidden_weights = <const float*>vec2scores_W.data output.hidden_weights = <const float*>vec2scores_W.data
output.hidden_bias = <const float*>vec2scores_b.data output.hidden_bias = <const float*>vec2scores_b.data
cdef np.ndarray tokvecs = model.tokvecs
output.vectors = <float*>tokvecs.data
return output return output
@ -72,7 +70,6 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
return return
if A._max_size == 0: if A._max_size == 0:
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0])) A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
A.vectors = <float*>calloc(n.states * n.embed_width, sizeof(A.vectors[0]))
A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0])) A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0])) A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0])) A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
@ -81,8 +78,6 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
else: else:
A.token_ids = <int*>realloc(A.token_ids, A.token_ids = <int*>realloc(A.token_ids,
n.states * n.feats * sizeof(A.token_ids[0])) n.states * n.feats * sizeof(A.token_ids[0]))
A.vectors = <float*>realloc(A.vectors,
n.states * n.embed_width * sizeof(A.vectors[0]))
A.scores = <float*>realloc(A.scores, A.scores = <float*>realloc(A.scores,
n.states * n.classes * sizeof(A.scores[0])) n.states * n.classes * sizeof(A.scores[0]))
A.unmaxed = <float*>realloc(A.unmaxed, A.unmaxed = <float*>realloc(A.unmaxed,
@ -242,7 +237,7 @@ class ParserStepModel(Model):
def begin_update(self, states, drop=0.): def begin_update(self, states, drop=0.):
token_ids = self.get_token_ids(states) token_ids = self.get_token_ids(states)
vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0) vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
mask = self.ops.get_dropout_mask(vector.shape, drop) mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop)
if mask is not None: if mask is not None:
vector *= mask vector *= mask
scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop) scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
@ -251,7 +246,7 @@ class ParserStepModel(Model):
d_vector = get_d_vector(d_scores, sgd=sgd) d_vector = get_d_vector(d_scores, sgd=sgd)
if mask is not None: if mask is not None:
d_vector *= mask d_vector *= mask
if isinstance(self.ops, CupyOps) \ if isinstance(self.state2vec.ops, CupyOps) \
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray): and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously # Move token_ids and d_vector to GPU, asynchronously
self.backprops.append(( self.backprops.append((

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
from ..util import get_doc from ..util import get_doc
from ...tokens import Doc from ...tokens import Doc
from ...vocab import Vocab from ...vocab import Vocab
from ...attrs import LEMMA
import pytest import pytest
import numpy import numpy
@ -178,6 +179,26 @@ def test_doc_api_merge_hang(en_tokenizer):
doc.merge(8, 32, tag='', lemma='', ent_type='ORG') doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
def test_doc_api_retokenizer(en_tokenizer):
    """Merge tokens 4-6 through doc.retokenize() and check the result.

    The nine-token input should shrink to seven tokens, with the merged
    token at index 4 carrying the combined text of the span.
    """
    text = "WKRO played songs by the beach boys all night"
    doc = en_tokenizer(text)
    with doc.retokenize() as retokenizer:
        span = doc[4:7]
        retokenizer.merge(span)
    # Checked after the context manager exits, since merge() only queues the
    # span on the retokenizer.
    assert len(doc) == 7
    merged = doc[4]
    assert merged.text == 'the beach boys'
def test_doc_api_retokenizer_attrs(en_tokenizer):
    """Merge tokens 4-6 with explicit attrs and check they are applied.

    The attrs dict deliberately mixes key/value styles: an integer attribute
    ID (LEMMA) with a string value, and a string attribute name ('ENT_TYPE')
    with an integer value taken from the vocab's string store.
    """
    doc = en_tokenizer("WKRO played songs by the beach boys all night")
    org_id = doc.vocab.strings['ORG']
    attrs = {LEMMA: 'boys', 'ENT_TYPE': org_id}
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[4:7], attrs=attrs)
    assert len(doc) == 7
    merged = doc[4]
    assert merged.text == 'the beach boys'
    assert merged.lemma_ == 'boys'
    assert merged.ent_type_ == 'ORG'
def test_doc_api_sents_empty_string(en_tokenizer): def test_doc_api_sents_empty_string(en_tokenizer):
doc = en_tokenizer("") doc = en_tokenizer("")
doc.is_parsed = True doc.is_parsed = True

View File

@ -11,11 +11,13 @@ from .span cimport Span
from .token cimport Token from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..structs cimport LexemeC, TokenC from ..structs cimport LexemeC, TokenC
from ..attrs cimport * from ..attrs cimport TAG
from ..attrs import intify_attrs
from ..util import SimpleFrozenDict
cdef class Retokenizer: cdef class Retokenizer:
'''Helper class for doc.retokenize() context manager.''' """Helper class for doc.retokenize() context manager."""
cdef Doc doc cdef Doc doc
cdef list merges cdef list merges
cdef list splits cdef list splits
@ -24,14 +26,18 @@ cdef class Retokenizer:
self.merges = [] self.merges = []
self.splits = [] self.splits = []
def merge(self, Span span, attrs=None): def merge(self, Span span, attrs=SimpleFrozenDict()):
'''Mark a span for merging. The attrs will be applied to the resulting """Mark a span for merging. The attrs will be applied to the resulting
token.''' token.
"""
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
self.merges.append((span.start_char, span.end_char, attrs)) self.merges.append((span.start_char, span.end_char, attrs))
def split(self, Token token, orths, attrs=None): def split(self, Token token, orths, attrs=SimpleFrozenDict()):
'''Mark a Token for splitting, into the specified orths. The attrs """Mark a Token for splitting, into the specified orths. The attrs
will be applied to each subtoken.''' will be applied to each subtoken.
"""
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
self.splits.append((token.start_char, orths, attrs)) self.splits.append((token.start_char, orths, attrs))
def __enter__(self): def __enter__(self):
@ -125,5 +131,3 @@ def _merge(Doc doc, int start, int end, attributes):
# Clear the cached Python objects # Clear the cached Python objects
# Return the merged Python object # Return the merged Python object
return doc[start] return doc[start]

View File

@ -635,3 +635,18 @@ def use_gpu(gpu_id):
def fix_random_seed(seed=0): def fix_random_seed(seed=0):
random.seed(seed) random.seed(seed)
numpy.random.seed(seed) numpy.random.seed(seed)
class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty
    dictionary). Will raise an error if user or spaCy attempts to add to dict.

    Reading (lookup, iteration, `get`, `len`, etc.) works like a regular
    dict. Every mutating method is overridden to raise NotImplementedError
    with message E095, so a shared default argument can never be modified
    in place by accident.
    """
    def __setitem__(self, key, value):
        raise NotImplementedError(Errors.E095)

    def __delitem__(self, key):
        raise NotImplementedError(Errors.E095)

    def pop(self, key, default=None):
        raise NotImplementedError(Errors.E095)

    def popitem(self):
        raise NotImplementedError(Errors.E095)

    def clear(self):
        raise NotImplementedError(Errors.E095)

    def setdefault(self, key, default=None):
        # dict.setdefault inserts without going through __setitem__, so it
        # has to be blocked explicitly.
        raise NotImplementedError(Errors.E095)

    def update(self, other=None, **kwargs):
        # Accept the full dict.update() calling convention so that every
        # form (positional, keyword-only, empty) reports E095 rather than a
        # signature TypeError.
        raise NotImplementedError(Errors.E095)