Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2018-05-20 18:59:24 +02:00
commit bdc23dd8c1
6 changed files with 54 additions and 19 deletions


@@ -245,6 +245,8 @@ class Errors(object):
"the meta.json. Vector names are required to avoid issue #1660.")
E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}")
E094 = ("Error reading line {line_num} in vectors file {loc}.")
E095 = ("Can't write to frozen dictionary. This is likely an internal "
"error. Are you writing to a default function argument?")
@add_codes
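
E095 is the message raised by the SimpleFrozenDict helper added at the end of this commit whenever something writes to the frozen default. A generic Python illustration (not part of the diff) of the mutable-default-argument pitfall the message asks about:

def remember(token, seen={}):  # bad: one dict object is shared by every call
    seen[token] = True
    return seen

remember("a")   # {'a': True}
remember("b")   # {'a': True, 'b': True} -- state from the first call leaks in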


@@ -19,12 +19,10 @@ cdef struct WeightsC:
const float* feat_bias
const float* hidden_bias
const float* hidden_weights
const float* vectors
cdef struct ActivationsC:
int* token_ids
float* vectors
float* unmaxed
float* scores
float* hiddens


@@ -50,8 +50,6 @@ cdef WeightsC get_c_weights(model) except *:
cdef np.ndarray vec2scores_b = model.vec2scores.b
output.hidden_weights = <const float*>vec2scores_W.data
output.hidden_bias = <const float*>vec2scores_b.data
cdef np.ndarray tokvecs = model.tokvecs
output.vectors = <float*>tokvecs.data
return output
@@ -72,7 +70,6 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
return
if A._max_size == 0:
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
A.vectors = <float*>calloc(n.states * n.embed_width, sizeof(A.vectors[0]))
A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
@@ -81,8 +78,6 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
else:
A.token_ids = <int*>realloc(A.token_ids,
n.states * n.feats * sizeof(A.token_ids[0]))
A.vectors = <float*>realloc(A.vectors,
n.states * n.embed_width * sizeof(A.vectors[0]))
A.scores = <float*>realloc(A.scores,
n.states * n.classes * sizeof(A.scores[0]))
A.unmaxed = <float*>realloc(A.unmaxed,
@@ -242,7 +237,7 @@ class ParserStepModel(Model):
def begin_update(self, states, drop=0.):
token_ids = self.get_token_ids(states)
vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
mask = self.ops.get_dropout_mask(vector.shape, drop)
mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop)
if mask is not None:
vector *= mask
scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
@@ -251,7 +246,7 @@ class ParserStepModel(Model):
d_vector = get_d_vector(d_scores, sgd=sgd)
if mask is not None:
d_vector *= mask
if isinstance(self.ops, CupyOps) \
if isinstance(self.state2vec.ops, CupyOps) \
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
self.backprops.append((
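
The two changes above swap self.ops for self.vec2scores.ops (when building the dropout mask) and self.state2vec.ops (when checking whether token_ids need to be moved to the GPU), so the mask and the device check come from the model that actually owns the arrays. A minimal sketch of the ops/dropout interaction, assuming Thinc's NumpyOps.get_dropout_mask(shape, drop) as used by spaCy 2.x; not the spaCy implementation itself:

from thinc.neural.ops import NumpyOps
import numpy

ops = NumpyOps()
vector = numpy.ones((4, 8), dtype="f")

mask = ops.get_dropout_mask(vector.shape, 0.0)
assert mask is None            # drop=0.0 returns None, so the no-dropout path is free

mask = ops.get_dropout_mask(vector.shape, 0.2)
if mask is not None:           # the mask is allocated by these ops, so it
    vector *= mask             # lives on the same device as `vector`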


@@ -4,6 +4,7 @@ from __future__ import unicode_literals
from ..util import get_doc
from ...tokens import Doc
from ...vocab import Vocab
from ...attrs import LEMMA
import pytest
import numpy
@@ -178,6 +179,26 @@ def test_doc_api_merge_hang(en_tokenizer):
doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
def test_doc_api_retokenizer(en_tokenizer):
doc = en_tokenizer("WKRO played songs by the beach boys all night")
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7])
assert len(doc) == 7
assert doc[4].text == 'the beach boys'
def test_doc_api_retokenizer_attrs(en_tokenizer):
doc = en_tokenizer("WKRO played songs by the beach boys all night")
# test both string and integer attributes and values
attrs = {LEMMA: 'boys', 'ENT_TYPE': doc.vocab.strings['ORG']}
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7], attrs=attrs)
assert len(doc) == 7
assert doc[4].text == 'the beach boys'
assert doc[4].lemma_ == 'boys'
assert doc[4].ent_type_ == 'ORG'
def test_doc_api_sents_empty_string(en_tokenizer):
doc = en_tokenizer("")
doc.is_parsed = True


@@ -11,11 +11,13 @@ from .span cimport Span
from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..structs cimport LexemeC, TokenC
from ..attrs cimport *
from ..attrs cimport TAG
from ..attrs import intify_attrs
from ..util import SimpleFrozenDict
cdef class Retokenizer:
'''Helper class for doc.retokenize() context manager.'''
"""Helper class for doc.retokenize() context manager."""
cdef Doc doc
cdef list merges
cdef list splits
@@ -24,14 +26,18 @@ cdef class Retokenizer:
self.merges = []
self.splits = []
def merge(self, Span span, attrs=None):
'''Mark a span for merging. The attrs will be applied to the resulting
token.'''
def merge(self, Span span, attrs=SimpleFrozenDict()):
"""Mark a span for merging. The attrs will be applied to the resulting
token.
"""
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
self.merges.append((span.start_char, span.end_char, attrs))
def split(self, Token token, orths, attrs=None):
'''Mark a Token for splitting, into the specified orths. The attrs
will be applied to each subtoken.'''
def split(self, Token token, orths, attrs=SimpleFrozenDict()):
"""Mark a Token for splitting, into the specified orths. The attrs
will be applied to each subtoken.
"""
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
self.splits.append((token.start_char, orths, attrs))
def __enter__(self):
@@ -125,5 +131,3 @@ def _merge(Doc doc, int start, int end, attributes):
# Clear the cached Python objects
# Return the merged Python object
return doc[start]
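
Changing the default from attrs=None to attrs=SimpleFrozenDict() lets merge() and split() pass the default straight into intify_attrs() without a None check, while any accidental write to the shared default raises E095 instead of leaking state. A sketch of the difference, using hypothetical function names:

def merge_with_none(span, attrs=None):
    if attrs is None:          # extra guard needed on every call path
        attrs = {}
    return dict(attrs)

def merge_with_frozen(span, attrs=SimpleFrozenDict()):
    return dict(attrs)         # safe: mutating the default would raise E095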


@@ -635,3 +635,18 @@ def use_gpu(gpu_id):
def fix_random_seed(seed=0):
random.seed(seed)
numpy.random.seed(seed)
class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default
function or method argument (for arguments that should default to empty
dictionary). Will raise an error if user or spaCy attempts to add to dict.
"""
def __setitem__(self, key, value):
raise NotImplementedError(Errors.E095)
def pop(self, key, default=None):
raise NotImplementedError(Errors.E095)
def update(self, other):
raise NotImplementedError(Errors.E095)
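
A short usage sketch (hypothetical snippet, not part of the commit) showing how the frozen default behaves:

frozen = SimpleFrozenDict()

assert len(frozen) == 0                 # reads work like a normal empty dict
assert frozen.get("LEMMA") is None

try:
    frozen["LEMMA"] = "boys"            # any write raises with the E095 message
except NotImplementedError:
    pass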