Merge branch 'develop'

This commit is contained in:
Matthew Honnibal 2017-03-10 02:49:39 -06:00
commit ea53647362
29 changed files with 535 additions and 138 deletions

View File

@ -66,8 +66,8 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg, def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0): n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
format_str = '{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}' format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
with Language.train(model_dir, train_data, with Language.train(model_dir, train_data,
tagger_cfg, parser_cfg, entity_cfg) as trainer: tagger_cfg, parser_cfg, entity_cfg) as trainer:
loss = 0 loss = 0
@ -76,11 +76,13 @@ def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, ent
for doc, gold in epoch: for doc, gold in epoch:
trainer.update(doc, gold) trainer.update(doc, gold)
dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc) dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
print(format_str.format(itn, loss, **dev_scores.scores)) print(format_str.format(itn, trainer.nlp.parser.model.nr_weight,
trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores))
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
beam_width=None, cand_preproc=None): beam_width=None, cand_preproc=None):
print("Load parser", model_dir)
nlp = Language(path=model_dir) nlp = Language(path=model_dir)
if nlp.lang == 'de': if nlp.lang == 'de':
nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string]) nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
@ -145,21 +147,25 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
verbose=("Verbose error reporting", "flag", "v", bool), verbose=("Verbose error reporting", "flag", "v", bool),
debug=("Debug mode", "flag", "d", bool), debug=("Debug mode", "flag", "d", bool),
pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool), pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
L1=("L1 regularization penalty", "option", "L", float),
) )
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False): debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
L1=1e-6):
parser_cfg = dict(locals()) parser_cfg = dict(locals())
tagger_cfg = dict(locals()) tagger_cfg = dict(locals())
entity_cfg = dict(locals()) entity_cfg = dict(locals())
lang = spacy.util.get_lang_class(language) lang = spacy.util.get_lang_class(language)
parser_cfg['features'] = lang.Defaults.parser_features parser_cfg['features'] = lang.Defaults.parser_features
entity_cfg['features'] = lang.Defaults.entity_features entity_cfg['features'] = lang.Defaults.entity_features
if not eval_only: if not eval_only:
gold_train = list(read_json_file(train_loc)) gold_train = list(read_json_file(train_loc))
gold_dev = list(read_json_file(dev_loc)) gold_dev = list(read_json_file(dev_loc))
if n_sents > 0:
gold_train = gold_train[:n_sents]
train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg, train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level, n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
n_iter=n_iter) n_iter=n_iter)

View File

@ -10,3 +10,4 @@ six
ujson>=1.35 ujson>=1.35
cloudpickle cloudpickle
sputnik>=0.9.2,<0.10.0 sputnik>=0.9.2,<0.10.0
dill>=0.2,<0.3

View File

@ -241,7 +241,8 @@ def setup_package():
'cloudpickle', 'cloudpickle',
'pathlib', 'pathlib',
'sputnik>=0.9.2,<0.10.0', 'sputnik>=0.9.2,<0.10.0',
'ujson>=1.35'], 'ujson>=1.35',
'dill>=0.2,<0.3'],
classifiers=[ classifiers=[
'Development Status :: 5 - Production/Stable', 'Development Status :: 5 - Production/Stable',
'Environment :: Console', 'Environment :: Console',

View File

@ -93,7 +93,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
'''Normalize a dictionary of attributes, converting them to ints. '''Normalize a dictionary of attributes, converting them to ints.
Arguments: Arguments:
stringy_attrs (dict): stringy_attrs (dict):
Dictionary keyed by attribute string names. Values can be ints or strings. Dictionary keyed by attribute string names. Values can be ints or strings.
@ -125,7 +125,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', 'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case'] 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
'Polarity', # U20
]
for key in morph_keys: for key in morph_keys:
if key in stringy_attrs: if key in stringy_attrs:
stringy_attrs.pop(key) stringy_attrs.pop(key)

View File

@ -4,6 +4,20 @@ from cymem.cymem cimport Pool
cdef class CFile: cdef class CFile:
cdef FILE* fp cdef FILE* fp
cdef bint is_open cdef bint is_open
cdef Pool mem
cdef int size # For compatibility with subclass
cdef int _capacity # For compatibility with subclass
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
cdef class StringCFile(CFile):
cdef unsigned char* data
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

View File

@ -1,4 +1,5 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memcpy
cdef class CFile: cdef class CFile:
@ -9,6 +10,7 @@ cdef class CFile:
mode_str = mode mode_str = mode
if hasattr(loc, 'as_posix'): if hasattr(loc, 'as_posix'):
loc = loc.as_posix() loc = loc.as_posix()
self.mem = Pool()
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self.fp = fopen(<char*>bytes_loc, mode_str) self.fp = fopen(<char*>bytes_loc, mode_str)
if self.fp == NULL: if self.fp == NULL:
@ -45,3 +47,42 @@ cdef class CFile:
cdef bytes py_bytes = value.encode('utf8') cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars) self.write(sizeof(char), len(py_bytes), chars)
cdef class StringCFile:
    """In-memory counterpart of CFile that keeps its contents in a byte buffer.

    The buffer is allocated from a Pool owned by the instance and is grown
    on demand by write_from(). read_into() consumes bytes from the front of
    the buffer.

    NOTE(review): the read cursor is implemented by advancing `self.data`
    itself (see read_into). Tracking a separate read offset would be more
    robust, but that needs a new cdef attribute declared in the matching
    .pxd, so it is only flagged here rather than changed.
    """
    def __init__(self, mode, bytes data=b'', on_open_error=None):
        """Allocate the buffer and seed it with `data`.

        mode: File-style mode string; the object is flagged open only when
            it contains 'w'.
        data (bytes): Initial contents, copied into the buffer.
        on_open_error: Accepted but not used by this class.
        """
        cdef Py_ssize_t n_bytes = len(data)
        self.mem = Pool()
        self.is_open = 'w' in mode
        # Never start with fewer than 8 bytes of capacity, so that an empty
        # file still gets a valid allocation.
        self._capacity = max(n_bytes, 8)
        self.size = n_bytes
        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
        # Single bulk copy instead of a per-byte loop. The length is passed
        # explicitly, so embedded NUL bytes are copied as well.
        memcpy(self.data, <char*>data, n_bytes)

    def close(self):
        """Mark the file as closed. The buffer itself is left untouched."""
        self.is_open = False

    def string_data(self):
        """Return `size` bytes, taken from `size` bytes before the cursor.

        NOTE(review): `self.data - self.size` is the start of the buffer only
        once exactly `size` bytes have been consumed through read_into().
        write_from() never advances `self.data`, so on a freshly written
        object this expression points before the allocation. Kept as in the
        original; a real fix needs a read-offset attribute in the .pxd.
        """
        return (self.data-self.size)[:self.size]

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        # Copy `number` elements of `elem_size` bytes to `dest`, then move
        # the cursor past them.
        # NOTE(review): nothing here checks the request against `size`, and
        # presumably Pool.realloc() needs the pointer originally returned by
        # alloc(), so writing after a read looks unsafe -- confirm.
        memcpy(dest, self.data, elem_size * number)
        self.data += elem_size * number

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
        # Append `number` elements of `elem_size` bytes from `src`. The
        # parameter order follows the CFile declaration and the sibling
        # read_into/alloc_read; only the product of the two is used.
        cdef size_t write_size = number * elem_size
        cdef size_t needed = self.size + write_size
        if needed >= <size_t>self._capacity:
            # Grow to twice the required size so repeated small writes do
            # not trigger a reallocation each time.
            self._capacity = needed * 2
            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
        memcpy(&self.data[self.size], src, write_size)
        self.size += write_size

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        # Allocate room for the elements from the caller's pool `mem`, fill
        # it via read_into(), and hand the new memory back.
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        """Append `value` to the buffer, encoded as UTF-8."""
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        # The original called `self.write(...)`, but this class defines no
        # `write` method; route the bytes through write_from() instead.
        self.write_from(chars, len(py_bytes), sizeof(char))

View File

@ -41,7 +41,7 @@ TAG_MAP = {
"PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"}, "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
"PTKA": {POS: PART}, "PTKA": {POS: PART},
"PTKANT": {POS: PART, "PartType": "res"}, "PTKANT": {POS: PART, "PartType": "res"},
"PTKNEG": {POS: PART, "Negative": "yes"}, "PTKNEG": {POS: PART, "Polarity": "Neg"},
"PTKVZ": {POS: PART, "PartType": "vbp"}, "PTKVZ": {POS: PART, "PartType": "vbp"},
"PTKZU": {POS: PART, "PartType": "inf"}, "PTKZU": {POS: PART, "PartType": "inf"},
"PWAT": {POS: DET, "PronType": "int"}, "PWAT": {POS: DET, "PronType": "int"},

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from os import path from os import path
from pathlib import Path
from ..util import match_best_version from ..util import match_best_version
from ..util import get_data_path from ..util import get_data_path
@ -13,6 +14,11 @@ from ..attrs import LANG
from .language_data import * from .language_data import *
try:
basestring
except NameError:
basestring = str
class English(Language): class English(Language):
lang = 'en' lang = 'en'
@ -43,14 +49,15 @@ def _fix_deprecated_glove_vectors_loading(overrides):
data_path = get_data_path() data_path = get_data_path()
else: else:
path = overrides['path'] path = overrides['path']
if isinstance(path, basestring):
path = Path(path)
data_path = path.parent data_path = path.parent
vec_path = None vec_path = None
if 'add_vectors' not in overrides: if 'add_vectors' not in overrides:
if 'vectors' in overrides: if 'vectors' in overrides:
vec_path = match_best_version(overrides['vectors'], None, data_path) vec_path = match_best_version(overrides['vectors'], None, data_path)
if vec_path is None: if vec_path is None:
raise IOError( return overrides
'Could not load data pack %s from %s' % (overrides['vectors'], data_path))
else: else:
vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path) vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
if vec_path is not None: if vec_path is not None:

View File

@ -16,7 +16,7 @@ TAG_MAP = {
"$": {POS: SYM, "Other": {"SymType": "currency"}}, "$": {POS: SYM, "Other": {"SymType": "currency"}},
"#": {POS: SYM, "Other": {"SymType": "numbersign"}}, "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
"AFX": {POS: ADJ, "Hyph": "yes"}, "AFX": {POS: ADJ, "Hyph": "yes"},
"CC": {POS: CONJ, "ConjType": "coor"}, "CC": {POS: CCONJ, "ConjType": "coor"},
"CD": {POS: NUM, "NumType": "card"}, "CD": {POS: NUM, "NumType": "card"},
"DT": {POS: DET}, "DT": {POS: DET},
"EX": {POS: ADV, "AdvType": "ex"}, "EX": {POS: ADV, "AdvType": "ex"},

View File

@ -5,7 +5,7 @@ import pathlib
from contextlib import contextmanager from contextlib import contextmanager
import shutil import shutil
import ujson as json import ujson
try: try:
@ -13,6 +13,10 @@ try:
except NameError: except NameError:
basestring = str basestring = str
try:
unicode
except NameError:
unicode = str
from .tokenizer import Tokenizer from .tokenizer import Tokenizer
from .vocab import Vocab from .vocab import Vocab
@ -226,12 +230,21 @@ class Language(object):
parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples) parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples) entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
with (dep_model_dir / 'config.json').open('w') as file_: with (dep_model_dir / 'config.json').open('wb') as file_:
json.dump(parser_cfg, file_) data = ujson.dumps(parser_cfg)
with (ner_model_dir / 'config.json').open('w') as file_: if isinstance(data, unicode):
json.dump(entity_cfg, file_) data = data.encode('utf8')
with (pos_model_dir / 'config.json').open('w') as file_: file_.write(data)
json.dump(tagger_cfg, file_) with (ner_model_dir / 'config.json').open('wb') as file_:
data = ujson.dumps(entity_cfg)
if isinstance(data, unicode):
data = data.encode('utf8')
file_.write(data)
with (pos_model_dir / 'config.json').open('wb') as file_:
data = ujson.dumps(tagger_cfg)
if isinstance(data, unicode):
data = data.encode('utf8')
file_.write(data)
self = cls( self = cls(
path=path, path=path,
@ -252,7 +265,7 @@ class Language(object):
self.entity = self.Defaults.create_entity(self) self.entity = self.Defaults.create_entity(self)
self.pipeline = self.Defaults.create_pipeline(self) self.pipeline = self.Defaults.create_pipeline(self)
yield Trainer(self, gold_tuples) yield Trainer(self, gold_tuples)
self.end_training() self.end_training(path=path)
def __init__(self, **overrides): def __init__(self, **overrides):
if 'data_dir' in overrides and 'path' not in overrides: if 'data_dir' in overrides and 'path' not in overrides:
@ -391,12 +404,14 @@ class Language(object):
else: else:
entity_iob_freqs = [] entity_iob_freqs = []
entity_type_freqs = [] entity_type_freqs = []
with (path / 'vocab' / 'serializer.json').open('w') as file_: with (path / 'vocab' / 'serializer.json').open('wb') as file_:
file_.write( data = ujson.dumps([
json.dumps([ (TAG, tagger_freqs),
(TAG, tagger_freqs), (DEP, dep_freqs),
(DEP, dep_freqs), (ENT_IOB, entity_iob_freqs),
(ENT_IOB, entity_iob_freqs), (ENT_TYPE, entity_type_freqs),
(ENT_TYPE, entity_type_freqs), (HEAD, head_freqs)
(HEAD, head_freqs) ])
])) if isinstance(data, unicode):
data = data.encode('utf8')
file_.write(data)

View File

@ -19,6 +19,7 @@ TAG_MAP = {
"AUX": {POS: AUX}, "AUX": {POS: AUX},
"X": {POS: X}, "X": {POS: X},
"CONJ": {POS: CONJ}, "CONJ": {POS: CONJ},
"CCONJ": {POS: CCONJ}, # U20
"ADJ": {POS: ADJ}, "ADJ": {POS: ADJ},
"VERB": {POS: VERB}, "VERB": {POS: VERB},
"PART": {POS: PART} "PART": {POS: PART}

View File

@ -37,7 +37,7 @@ cdef class Morphology:
cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag(self, TokenC* token, tag) except -1
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
@ -80,6 +80,7 @@ cpdef enum univ_morph_t:
Definite_two Definite_two
Definite_def Definite_def
Definite_red Definite_red
Definite_cons # U20
Definite_ind Definite_ind
Degree_cmp Degree_cmp
Degree_comp Degree_comp
@ -103,6 +104,8 @@ cpdef enum univ_morph_t:
Negative_neg Negative_neg
Negative_pos Negative_pos
Negative_yes Negative_yes
Polarity_neg # U20
Polarity_pos # U20
Number_com Number_com
Number_dual Number_dual
Number_none Number_none
@ -151,6 +154,7 @@ cpdef enum univ_morph_t:
VerbForm_partPres VerbForm_partPres
VerbForm_sup VerbForm_sup
VerbForm_trans VerbForm_trans
VerbForm_conv # U20
VerbForm_gdv # la VerbForm_gdv # la
Voice_act Voice_act
Voice_cau Voice_cau

View File

@ -192,6 +192,7 @@ IDS = {
"Definite_two": Definite_two, "Definite_two": Definite_two,
"Definite_def": Definite_def, "Definite_def": Definite_def,
"Definite_red": Definite_red, "Definite_red": Definite_red,
"Definite_cons": Definite_cons, # U20
"Definite_ind": Definite_ind, "Definite_ind": Definite_ind,
"Degree_cmp": Degree_cmp, "Degree_cmp": Degree_cmp,
"Degree_comp": Degree_comp, "Degree_comp": Degree_comp,
@ -215,6 +216,8 @@ IDS = {
"Negative_neg": Negative_neg, "Negative_neg": Negative_neg,
"Negative_pos": Negative_pos, "Negative_pos": Negative_pos,
"Negative_yes": Negative_yes, "Negative_yes": Negative_yes,
"Polarity_neg": Polarity_neg, # U20
"Polarity_pos": Polarity_pos, # U20
"Number_com": Number_com, "Number_com": Number_com,
"Number_dual": Number_dual, "Number_dual": Number_dual,
"Number_none": Number_none, "Number_none": Number_none,
@ -263,6 +266,7 @@ IDS = {
"VerbForm_partPres": VerbForm_partPres, "VerbForm_partPres": VerbForm_partPres,
"VerbForm_sup": VerbForm_sup, "VerbForm_sup": VerbForm_sup,
"VerbForm_trans": VerbForm_trans, "VerbForm_trans": VerbForm_trans,
"VerbForm_conv": VerbForm_conv, # U20
"VerbForm_gdv ": VerbForm_gdv, # la, "VerbForm_gdv ": VerbForm_gdv, # la,
"Voice_act": Voice_act, "Voice_act": Voice_act,
"Voice_cau": Voice_cau, "Voice_cau": Voice_cau,

View File

@ -7,6 +7,7 @@ cpdef enum univ_pos_t:
ADV ADV
AUX AUX
CONJ CONJ
CCONJ # U20
DET DET
INTJ INTJ
NOUN NOUN

View File

@ -7,7 +7,8 @@ IDS = {
"ADP": ADP, "ADP": ADP,
"ADV": ADV, "ADV": ADV,
"AUX": AUX, "AUX": AUX,
"CONJ": CONJ, "CONJ": CONJ, # U20
"CCONJ": CCONJ,
"DET": DET, "DET": DET,
"INTJ": INTJ, "INTJ": INTJ,
"NOUN": NOUN, "NOUN": NOUN,

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals, absolute_import
cimport cython cimport cython
from libc.string cimport memcpy from libc.string cimport memcpy
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t, uint32_t
from murmurhash.mrmr cimport hash64, hash32 from murmurhash.mrmr cimport hash64, hash32
@ -12,22 +12,19 @@ from preshed.maps cimport map_iter, key_t
from .typedefs cimport hash_t from .typedefs cimport hash_t
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
try: import ujson
import ujson as json
except ImportError:
import json
cpdef hash_t hash_string(unicode string) except 0: cpdef hash_t hash_string(unicode string) except 0:
chars = string.encode('utf8') chars = string.encode('utf8')
return _hash_utf8(chars, len(chars)) return hash_utf8(chars, len(chars))
cdef hash_t _hash_utf8(char* utf8_string, int length): cdef hash_t hash_utf8(char* utf8_string, int length) nogil:
return hash64(utf8_string, length, 1) return hash64(utf8_string, length, 1)
cdef uint32_t _hash32_utf8(char* utf8_string, int length): cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1) return hash32(utf8_string, length, 1)
@ -48,11 +45,11 @@ cdef unicode _decode(const Utf8Str* string):
return string.p[i:length + i].decode('utf8') return string.p[i:length + i].decode('utf8')
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *: cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
cdef int n_length_bytes cdef int n_length_bytes
cdef int i cdef int i
cdef Utf8Str string cdef Utf8Str string
assert length != 0 cdef uint32_t ulength = length
if length < sizeof(string.s): if length < sizeof(string.s):
string.s[0] = <unsigned char>length string.s[0] = <unsigned char>length
memcpy(&string.s[1], chars, length) memcpy(&string.s[1], chars, length)
@ -98,6 +95,14 @@ cdef class StringStore:
def __get__(self): def __get__(self):
return self.size -1 return self.size -1
def __reduce__(self):
    """Pickle support: rebuild the store from the list of its strings.

    Raises NotImplementedError when the store is frozen, because that case
    is not supported yet.
    """
    # TODO: OOV words, for the is_frozen stuff?
    if not self.is_frozen:
        strings = list(self)
        return (StringStore, (strings,))
    raise NotImplementedError(
        "Currently missing support for pickling StringStore when "
        "is_frozen=True")
def __len__(self): def __len__(self):
"""The number of strings in the store. """The number of strings in the store.
@ -149,7 +154,7 @@ cdef class StringStore:
# pretty bad. # pretty bad.
# We could also get unlucky here, and hash into a value that # We could also get unlucky here, and hash into a value that
# collides with the 'real' strings. # collides with the 'real' strings.
return _hash32_utf8(byte_string, len(byte_string)) return hash32_utf8(byte_string, len(byte_string))
else: else:
return utf8str - self.c return utf8str - self.c
@ -200,7 +205,7 @@ cdef class StringStore:
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length): cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length):
# TODO: This function's API/behaviour is an unholy mess... # TODO: This function's API/behaviour is an unholy mess...
# 0 means missing, but we don't bother offsetting the index. # 0 means missing, but we don't bother offsetting the index.
cdef hash_t key = _hash_utf8(utf8_string, length) cdef hash_t key = hash_utf8(utf8_string, length)
cdef Utf8Str* value = <Utf8Str*>self._map.get(key) cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
if value is not NULL: if value is not NULL:
return value return value
@ -209,7 +214,7 @@ cdef class StringStore:
return value return value
if self.is_frozen: if self.is_frozen:
# OOV store uses 32 bit hashes. Pretty ugly :( # OOV store uses 32 bit hashes. Pretty ugly :(
key32 = _hash32_utf8(utf8_string, length) key32 = hash32_utf8(utf8_string, length)
# Important: Make the OOV store own the memory. That way it's trivial # Important: Make the OOV store own the memory. That way it's trivial
# to flush them all. # to flush them all.
value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str)) value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
@ -232,7 +237,7 @@ cdef class StringStore:
Returns: Returns:
None None
""" """
string_data = json.dumps(list(self)) string_data = ujson.dumps(list(self))
if not isinstance(string_data, unicode): if not isinstance(string_data, unicode):
string_data = string_data.decode('utf8') string_data = string_data.decode('utf8')
# TODO: OOV? # TODO: OOV?
@ -246,7 +251,7 @@ cdef class StringStore:
Returns: Returns:
None None
""" """
strings = json.load(file_) strings = ujson.load(file_)
if strings == ['']: if strings == ['']:
return None return None
cdef unicode string cdef unicode string
@ -271,7 +276,7 @@ cdef class StringStore:
# Find array index with pointer arithmetic # Find array index with pointer arithmetic
offset = ((<Utf8Str*>value) - self.c) offset = ((<Utf8Str*>value) - self.c)
keys[offset] = key keys[offset] = key
self._resize_at *= 2 self._resize_at *= 2
cdef size_t new_size = self._resize_at * sizeof(Utf8Str) cdef size_t new_size = self._resize_at * sizeof(Utf8Str)
self.c = <Utf8Str*>self.mem.realloc(self.c, new_size) self.c = <Utf8Str*>self.mem.realloc(self.c, new_size)

View File

@ -13,7 +13,7 @@ cpdef enum symbol_t:
LIKE_EMAIL LIKE_EMAIL
IS_STOP IS_STOP
IS_OOV IS_OOV
FLAG14 = 14 FLAG14 = 14
FLAG15 FLAG15
FLAG16 FLAG16
@ -90,6 +90,7 @@ cpdef enum symbol_t:
ADV ADV
AUX AUX
CONJ CONJ
CCONJ # U20
DET DET
INTJ INTJ
NOUN NOUN
@ -107,11 +108,14 @@ cpdef enum symbol_t:
Animacy_anim Animacy_anim
Animacy_inam Animacy_inam
Animacy_hum # U20
Aspect_freq Aspect_freq
Aspect_imp Aspect_imp
Aspect_mod Aspect_mod
Aspect_none Aspect_none
Aspect_perf Aspect_perf
Aspect_iter # U20
Aspect_hab # U20
Case_abe Case_abe
Case_abl Case_abl
Case_abs Case_abs
@ -120,10 +124,12 @@ cpdef enum symbol_t:
Case_all Case_all
Case_cau Case_cau
Case_com Case_com
Case_cmp # U20
Case_dat Case_dat
Case_del Case_del
Case_dis Case_dis
Case_ela Case_ela
Case_equ # U20
Case_ess Case_ess
Case_gen Case_gen
Case_ill Case_ill
@ -142,7 +148,9 @@ cpdef enum symbol_t:
Definite_two Definite_two
Definite_def Definite_def
Definite_red Definite_red
Definite_cons # U20
Definite_ind Definite_ind
Definite_spec # U20
Degree_cmp Degree_cmp
Degree_comp Degree_comp
Degree_none Degree_none
@ -151,6 +159,8 @@ cpdef enum symbol_t:
Degree_abs Degree_abs
Degree_com Degree_com
Degree_dim # du Degree_dim # du
Degree_equ # U20
Evident_nfh # U20
Gender_com Gender_com
Gender_fem Gender_fem
Gender_masc Gender_masc
@ -162,16 +172,21 @@ cpdef enum symbol_t:
Mood_pot Mood_pot
Mood_sub Mood_sub
Mood_opt Mood_opt
Mood_prp # U20
Mood_adm # U20
Negative_neg Negative_neg
Negative_pos Negative_pos
Negative_yes Negative_yes
Polarity_neg # U20
Polarity_pos # U20
Number_com Number_com
Number_dual Number_dual
Number_none Number_none
Number_plur Number_plur
Number_sing Number_sing
Number_ptan # bg Number_ptan # bg
Number_count # bg Number_count # bg, U20
Number_tri # U20
NumType_card NumType_card
NumType_dist NumType_dist
NumType_frac NumType_frac
@ -197,7 +212,8 @@ cpdef enum symbol_t:
PronType_rel PronType_rel
PronType_tot PronType_tot
PronType_clit PronType_clit
PronType_exc # es, ca, it, fa PronType_exc # es, ca, it, fa, U20
PronType_emp # U20
Reflex_yes Reflex_yes
Tense_fut Tense_fut
Tense_imp Tense_imp
@ -213,12 +229,17 @@ cpdef enum symbol_t:
VerbForm_partPres VerbForm_partPres
VerbForm_sup VerbForm_sup
VerbForm_trans VerbForm_trans
VerbForm_conv # U20
VerbForm_gdv # la VerbForm_gdv # la
VerbForm_vnoun # U20
Voice_act Voice_act
Voice_cau Voice_cau
Voice_pass Voice_pass
Voice_mid # gkc Voice_mid # gkc, U20
Voice_int # hb Voice_int # hb
Voice_antip # U20
Voice_dir # U20
Voice_inv # U20
Abbr_yes # cz, fi, sl, U Abbr_yes # cz, fi, sl, U
AdpType_prep # cz, U AdpType_prep # cz, U
AdpType_post # U AdpType_post # U
@ -284,6 +305,10 @@ cpdef enum symbol_t:
Number_psee_plur # U Number_psee_plur # U
Number_psor_sing # cz, fi, sl, U Number_psor_sing # cz, fi, sl, U
Number_psor_plur # cz, fi, sl, U Number_psor_plur # cz, fi, sl, U
Number_pauc # U20
Number_grpa # U20
Number_grpl # U20
Number_inv # U20
NumForm_digit # cz, sl, U NumForm_digit # cz, sl, U
NumForm_roman # cz, sl, U NumForm_roman # cz, sl, U
NumForm_word # cz, sl, U NumForm_word # cz, sl, U
@ -311,6 +336,8 @@ cpdef enum symbol_t:
Person_psor_one # fi, U Person_psor_one # fi, U
Person_psor_two # fi, U Person_psor_two # fi, U
Person_psor_three # fi, U Person_psor_three # fi, U
Person_zero # U20
Person_four # U20
Polite_inf # bq, U Polite_inf # bq, U
Polite_pol # bq, U Polite_pol # bq, U
Polite_abs_inf # bq, U Polite_abs_inf # bq, U
@ -319,6 +346,10 @@ cpdef enum symbol_t:
Polite_erg_pol # bq, U Polite_erg_pol # bq, U
Polite_dat_inf # bq, U Polite_dat_inf # bq, U
Polite_dat_pol # bq, U Polite_dat_pol # bq, U
Polite_infm # U20
Polite_form # U20
Polite_form_elev # U20
Polite_form_humb # U20
Prefix_yes # U Prefix_yes # U
PrepCase_npr # cz PrepCase_npr # cz
PrepCase_pre # U PrepCase_pre # U
@ -383,6 +414,7 @@ cpdef enum symbol_t:
ccomp ccomp
complm complm
conj conj
cop # U20
csubj csubj
csubjpass csubjpass
dep dep
@ -405,6 +437,8 @@ cpdef enum symbol_t:
num num
number number
oprd oprd
obj # U20
obl # U20
parataxis parataxis
partmod partmod
pcomp pcomp

View File

@ -91,6 +91,7 @@ IDS = {
"ADV": ADV, "ADV": ADV,
"AUX": AUX, "AUX": AUX,
"CONJ": CONJ, "CONJ": CONJ,
"CCONJ": CCONJ, # U20
"DET": DET, "DET": DET,
"INTJ": INTJ, "INTJ": INTJ,
"NOUN": NOUN, "NOUN": NOUN,
@ -108,11 +109,14 @@ IDS = {
"Animacy_anim": Animacy_anim, "Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam, "Animacy_inam": Animacy_inam,
"Animacy_hum": Animacy_hum, # U20
"Aspect_freq": Aspect_freq, "Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp, "Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod, "Aspect_mod": Aspect_mod,
"Aspect_none": Aspect_none, "Aspect_none": Aspect_none,
"Aspect_perf": Aspect_perf, "Aspect_perf": Aspect_perf,
"Aspect_iter": Aspect_iter, # U20
"Aspect_hab": Aspect_hab, # U20
"Case_abe": Case_abe, "Case_abe": Case_abe,
"Case_abl": Case_abl, "Case_abl": Case_abl,
"Case_abs": Case_abs, "Case_abs": Case_abs,
@ -121,10 +125,12 @@ IDS = {
"Case_all": Case_all, "Case_all": Case_all,
"Case_cau": Case_cau, "Case_cau": Case_cau,
"Case_com": Case_com, "Case_com": Case_com,
"Case_cmp": Case_cmp, # U20
"Case_dat": Case_dat, "Case_dat": Case_dat,
"Case_del": Case_del, "Case_del": Case_del,
"Case_dis": Case_dis, "Case_dis": Case_dis,
"Case_ela": Case_ela, "Case_ela": Case_ela,
"Case_equ": Case_equ, # U20
"Case_ess": Case_ess, "Case_ess": Case_ess,
"Case_gen": Case_gen, "Case_gen": Case_gen,
"Case_ill": Case_ill, "Case_ill": Case_ill,
@ -143,7 +149,9 @@ IDS = {
"Definite_two": Definite_two, "Definite_two": Definite_two,
"Definite_def": Definite_def, "Definite_def": Definite_def,
"Definite_red": Definite_red, "Definite_red": Definite_red,
"Definite_cons": Definite_cons, # U20
"Definite_ind": Definite_ind, "Definite_ind": Definite_ind,
"Definite_spec": Definite_spec, # U20
"Degree_cmp": Degree_cmp, "Degree_cmp": Degree_cmp,
"Degree_comp": Degree_comp, "Degree_comp": Degree_comp,
"Degree_none": Degree_none, "Degree_none": Degree_none,
@ -152,6 +160,8 @@ IDS = {
"Degree_abs": Degree_abs, "Degree_abs": Degree_abs,
"Degree_com": Degree_com, "Degree_com": Degree_com,
"Degree_dim ": Degree_dim, # du "Degree_dim ": Degree_dim, # du
"Degree_equ": Degree_equ, # U20
"Evident_nfh": Evident_nfh, # U20
"Gender_com": Gender_com, "Gender_com": Gender_com,
"Gender_fem": Gender_fem, "Gender_fem": Gender_fem,
"Gender_masc": Gender_masc, "Gender_masc": Gender_masc,
@ -163,16 +173,21 @@ IDS = {
"Mood_pot": Mood_pot, "Mood_pot": Mood_pot,
"Mood_sub": Mood_sub, "Mood_sub": Mood_sub,
"Mood_opt": Mood_opt, "Mood_opt": Mood_opt,
"Mood_prp": Mood_prp, # U20
"Mood_adm": Mood_adm, # U20
"Negative_neg": Negative_neg, "Negative_neg": Negative_neg,
"Negative_pos": Negative_pos, "Negative_pos": Negative_pos,
"Negative_yes": Negative_yes, "Negative_yes": Negative_yes,
"Polarity_neg": Polarity_neg, # U20
"Polarity_pos": Polarity_pos, # U20
"Number_com": Number_com, "Number_com": Number_com,
"Number_dual": Number_dual, "Number_dual": Number_dual,
"Number_none": Number_none, "Number_none": Number_none,
"Number_plur": Number_plur, "Number_plur": Number_plur,
"Number_sing": Number_sing, "Number_sing": Number_sing,
"Number_ptan ": Number_ptan, # bg "Number_ptan ": Number_ptan, # bg
"Number_count ": Number_count, # bg "Number_count ": Number_count, # bg, U20
"Number_tri": Number_tri, # U20
"NumType_card": NumType_card, "NumType_card": NumType_card,
"NumType_dist": NumType_dist, "NumType_dist": NumType_dist,
"NumType_frac": NumType_frac, "NumType_frac": NumType_frac,
@ -198,7 +213,8 @@ IDS = {
"PronType_rel": PronType_rel, "PronType_rel": PronType_rel,
"PronType_tot": PronType_tot, "PronType_tot": PronType_tot,
"PronType_clit": PronType_clit, "PronType_clit": PronType_clit,
"PronType_exc ": PronType_exc, # es, ca, it, fa, "PronType_exc": PronType_exc, # es, ca, it, fa, U20
"PronType_emp": PronType_emp, # U20
"Reflex_yes": Reflex_yes, "Reflex_yes": Reflex_yes,
"Tense_fut": Tense_fut, "Tense_fut": Tense_fut,
"Tense_imp": Tense_imp, "Tense_imp": Tense_imp,
@ -214,12 +230,17 @@ IDS = {
"VerbForm_partPres": VerbForm_partPres, "VerbForm_partPres": VerbForm_partPres,
"VerbForm_sup": VerbForm_sup, "VerbForm_sup": VerbForm_sup,
"VerbForm_trans": VerbForm_trans, "VerbForm_trans": VerbForm_trans,
"VerbForm_conv": VerbForm_conv, # U20
"VerbForm_gdv ": VerbForm_gdv, # la, "VerbForm_gdv ": VerbForm_gdv, # la,
"VerbForm_vnoun": VerbForm_vnoun, # U20
"Voice_act": Voice_act, "Voice_act": Voice_act,
"Voice_cau": Voice_cau, "Voice_cau": Voice_cau,
"Voice_pass": Voice_pass, "Voice_pass": Voice_pass,
"Voice_mid ": Voice_mid, # gkc, "Voice_mid ": Voice_mid, # gkc, U20
"Voice_int ": Voice_int, # hb, "Voice_int ": Voice_int, # hb,
"Voice_antip": Voice_antip, # U20
"Voice_dir": Voice_dir, # U20
"Voice_inv": Voice_inv, # U20
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U, "Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
"AdpType_prep ": AdpType_prep, # cz, U, "AdpType_prep ": AdpType_prep, # cz, U,
"AdpType_post ": AdpType_post, # U, "AdpType_post ": AdpType_post, # U,
@ -285,6 +306,10 @@ IDS = {
"Number_psee_plur ": Number_psee_plur, # U, "Number_psee_plur ": Number_psee_plur, # U,
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
"Number_pauc": Number_pauc, # U20
"Number_grpa": Number_grpa, # U20
"Number_grpl": Number_grpl, # U20
"Number_inv": Number_inv, # U20
"NumForm_digit ": NumForm_digit, # cz, sl, U, "NumForm_digit ": NumForm_digit, # cz, sl, U,
"NumForm_roman ": NumForm_roman, # cz, sl, U, "NumForm_roman ": NumForm_roman, # cz, sl, U,
"NumForm_word ": NumForm_word, # cz, sl, U, "NumForm_word ": NumForm_word, # cz, sl, U,
@ -312,6 +337,8 @@ IDS = {
"Person_psor_one ": Person_psor_one, # fi, U, "Person_psor_one ": Person_psor_one, # fi, U,
"Person_psor_two ": Person_psor_two, # fi, U, "Person_psor_two ": Person_psor_two, # fi, U,
"Person_psor_three ": Person_psor_three, # fi, U, "Person_psor_three ": Person_psor_three, # fi, U,
"Person_zero ": Person_zero, # U20
"Person_four ": Person_four, # U20
"Polite_inf ": Polite_inf, # bq, U, "Polite_inf ": Polite_inf, # bq, U,
"Polite_pol ": Polite_pol, # bq, U, "Polite_pol ": Polite_pol, # bq, U,
"Polite_abs_inf ": Polite_abs_inf, # bq, U, "Polite_abs_inf ": Polite_abs_inf, # bq, U,
@ -320,6 +347,10 @@ IDS = {
"Polite_erg_pol ": Polite_erg_pol, # bq, U, "Polite_erg_pol ": Polite_erg_pol, # bq, U,
"Polite_dat_inf ": Polite_dat_inf, # bq, U, "Polite_dat_inf ": Polite_dat_inf, # bq, U,
"Polite_dat_pol ": Polite_dat_pol, # bq, U, "Polite_dat_pol ": Polite_dat_pol, # bq, U,
"Polite_infm ": Polite_infm, # U20
"Polite_form ": Polite_form, # U20
"Polite_form_elev ": Polite_form_elev, # U20
"Polite_form_humb ": Polite_form_humb, # U20
"Prefix_yes ": Prefix_yes, # U, "Prefix_yes ": Prefix_yes, # U,
"PrepCase_npr ": PrepCase_npr, # cz, "PrepCase_npr ": PrepCase_npr, # cz,
"PrepCase_pre ": PrepCase_pre, # U, "PrepCase_pre ": PrepCase_pre, # U,
@ -384,6 +415,7 @@ IDS = {
"ccomp": ccomp, "ccomp": ccomp,
"complm": complm, "complm": complm,
"conj": conj, "conj": conj,
"cop": cop, # U20
"csubj": csubj, "csubj": csubj,
"csubjpass": csubjpass, "csubjpass": csubjpass,
"dep": dep, "dep": dep,
@ -406,6 +438,8 @@ IDS = {
"num": num, "num": num,
"number": number, "number": number,
"oprd": oprd, "oprd": oprd,
"obj": obj, # U20
"obl": obl, # U20
"parataxis": parataxis, "parataxis": parataxis,
"partmod": partmod, "partmod": partmod,
"pcomp": pcomp, "pcomp": pcomp,

View File

@ -8,7 +8,7 @@ from spacy.attrs import DEP, HEAD
def ancestors(tokenid, heads): def ancestors(tokenid, heads):
# returns all words going from the word up the path to the root # returns all words going from the word up the path to the root
# the path to root cannot be longer than the number of words in the sentence # the path to root cannot be longer than the number of words in the sentence
# this function ends after at most len(heads) steps # this function ends after at most len(heads) steps
# because it would otherwise loop indefinitely on cycles # because it would otherwise loop indefinitely on cycles
head = tokenid head = tokenid
cnt = 0 cnt = 0
@ -180,7 +180,7 @@ class PseudoProjectivity:
next_queue = [] next_queue = []
for qtoken in queue: for qtoken in queue:
for child in qtoken.children: for child in qtoken.children:
if child.is_space: continue if child.is_space: continue
if child == token: continue if child == token: continue
if child.dep_ == headlabel: if child.dep_ == headlabel:
return child return child

View File

@ -68,7 +68,7 @@ def get_templates(name):
cdef class ParserModel(AveragedPerceptron): cdef class ParserModel(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil: cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
fill_context(eg.atoms, state) fill_context(eg.atoms, state)
eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms) eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
@ -124,6 +124,8 @@ cdef class Parser:
elif 'features' not in cfg: elif 'features' not in cfg:
cfg['features'] = self.feature_templates cfg['features'] = self.feature_templates
self.model = ParserModel(cfg['features']) self.model = ParserModel(cfg['features'])
self.model.l1_penalty = cfg.get('L1', 0.0)
self.cfg = cfg self.cfg = cfg
def __reduce__(self): def __reduce__(self):
@ -232,7 +234,7 @@ cdef class Parser:
free(eg.scores) free(eg.scores)
free(eg.is_valid) free(eg.is_valid)
return 0 return 0
def update(self, Doc tokens, GoldParse gold): def update(self, Doc tokens, GoldParse gold):
"""Update the statistical model. """Update the statistical model.
@ -258,15 +260,20 @@ cdef class Parser:
self.model.set_featuresC(&eg.c, stcls.c) self.model.set_featuresC(&eg.c, stcls.c)
self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
self.model.updateC(&eg.c) self.model.time += 1
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
if eg.c.costs[guess] > 0:
best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
for feat in eg.c.features[:eg.c.nr_feat]:
self.model.update_weight_ftrl(feat.key, best, -feat.value * eg.c.costs[guess])
self.model.update_weight_ftrl(feat.key, guess, feat.value * eg.c.costs[guess])
action = self.moves.c[eg.guess] action = self.moves.c[guess]
action.do(stcls.c, action.label) action.do(stcls.c, action.label)
loss += eg.costs[eg.guess] loss += eg.costs[guess]
eg.fill_scores(0, eg.nr_class) eg.fill_scores(0, eg.c.nr_class)
eg.fill_costs(0, eg.nr_class) eg.fill_costs(0, eg.c.nr_class)
eg.fill_is_valid(1, eg.nr_class) eg.fill_is_valid(1, eg.c.nr_class)
return loss return loss
def step_through(self, Doc doc): def step_through(self, Doc doc):
@ -296,7 +303,7 @@ cdef class Parser:
# Doesn't set label into serializer -- subclasses override it to do that. # Doesn't set label into serializer -- subclasses override it to do that.
for action in self.moves.action_types: for action in self.moves.action_types:
self.moves.add_action(action, label) self.moves.add_action(action, label)
cdef class StepwiseState: cdef class StepwiseState:
cdef readonly StateClass stcls cdef readonly StateClass stcls
@ -385,6 +392,14 @@ class ParserStateError(ValueError):
"Please include the text that the parser failed on, which is:\n" "Please include the text that the parser failed on, which is:\n"
"%s" % repr(doc.text)) "%s" % repr(doc.text))
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil:
    # Return the index (within the first n classes) of the highest-scoring
    # class whose cost is <= 0. Returns -1 if no class has cost <= 0.
    # On equal scores the lowest index is kept, because the comparison is strict.
    cdef int winner = -1
    for i in range(n):
        if costs[i] > 0:
            # Costly classes are never candidates.
            continue
        if winner == -1 or scores[i] > scores[winner]:
            winner = i
    return winner
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
int nr_class) except -1: int nr_class) except -1:

View File

@ -13,13 +13,13 @@ from thinc.linalg cimport VecVec
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
from .attrs cimport TAG from .attrs cimport TAG
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .gold cimport GoldParse from .gold cimport GoldParse
from .attrs cimport * from .attrs cimport *
cpdef enum: cpdef enum:
P2_orth P2_orth
P2_cluster P2_cluster
@ -71,7 +71,7 @@ cpdef enum:
cdef class TaggerModel(AveragedPerceptron): cdef class TaggerModel(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *: cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
_fill_from_token(&eg.atoms[P2_orth], &tokens[i-2]) _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
_fill_from_token(&eg.atoms[P1_orth], &tokens[i-1]) _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
_fill_from_token(&eg.atoms[W_orth], &tokens[i]) _fill_from_token(&eg.atoms[W_orth], &tokens[i])
@ -152,6 +152,7 @@ cdef class Tagger:
model = TaggerModel(cfg.get('features', self.feature_templates)) model = TaggerModel(cfg.get('features', self.feature_templates))
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
self.model.l1_penalty = 0.0
# TODO: Move this to tag map # TODO: Move this to tag map
self.freqs = {TAG: defaultdict(int)} self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names: for tag in self.tag_names:
@ -191,7 +192,7 @@ cdef class Tagger:
nr_class=self.vocab.morphology.n_tags, nr_class=self.vocab.morphology.n_tags,
nr_feat=self.model.nr_feat) nr_feat=self.model.nr_feat)
for i in range(tokens.length): for i in range(tokens.length):
if tokens.c[i].pos == 0: if tokens.c[i].pos == 0:
self.model.set_featuresC(&eg.c, tokens.c, i) self.model.set_featuresC(&eg.c, tokens.c, i)
self.model.set_scoresC(eg.c.scores, self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat) eg.c.features, eg.c.nr_feat)
@ -217,7 +218,7 @@ cdef class Tagger:
for doc in stream: for doc in stream:
self(doc) self(doc)
yield doc yield doc
def update(self, Doc tokens, GoldParse gold): def update(self, Doc tokens, GoldParse gold):
"""Update the statistical model, with tags supplied for the given document. """Update the statistical model, with tags supplied for the given document.
@ -251,7 +252,7 @@ cdef class Tagger:
self.model.updateC(&eg.c) self.model.updateC(&eg.c)
self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess) self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)
correct += eg.cost == 0 correct += eg.cost == 0
self.freqs[TAG][tokens.c[i].tag] += 1 self.freqs[TAG][tokens.c[i].tag] += 1
eg.fill_scores(0, eg.c.nr_class) eg.fill_scores(0, eg.c.nr_class)

View File

@ -4,9 +4,15 @@ from __future__ import unicode_literals
import pytest import pytest
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"]) @pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text): def test_issue792(en_tokenizer, text):
"""Test for Issue #792: Trailing whitespace is removed after parsing.""" """Test for Issue #792: Trailing whitespace is removed after tokenization."""
doc = en_tokenizer(text) doc = en_tokenizer(text)
assert doc.text_with_ws == text assert ''.join([token.text_with_ws for token in doc]) == text
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
    """Control for Issue #792: texts that do not end in a space character
    must be reproduced exactly when the tokens' text_with_ws are joined."""
    tokens = en_tokenizer(text)
    rebuilt = ''.join(token.text_with_ws for token in tokens)
    assert rebuilt == text

View File

@ -0,0 +1,52 @@
'''
Test Matcher matches with '*' operator and Boolean flag
'''
from __future__ import unicode_literals
from __future__ import print_function
import pytest
from ...matcher import Matcher
from ...vocab import Vocab
from ...attrs import LOWER
from ...tokens import Doc
def test_basic_case():
    '''Check that a pattern with a '*' operator on a literal token yields
    exactly one match covering all four tokens of the document.'''
    lowercase = lambda string: string.lower()
    matcher = Matcher(Vocab(lex_attr_getters={LOWER: lowercase}))
    # The flag is registered but not referenced by the pattern below.
    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
    pattern = [
        {LOWER: "bob"},
        {'OP': '*', LOWER: 'and'},
        {LOWER: 'frank'}
    ]
    matcher.add_pattern("FarAway", pattern)
    words = ['bob', 'and', 'and', 'frank']
    doc = Doc(matcher.vocab, words=words)
    matches = matcher(doc)
    assert len(matches) == 1
    ent_id, label, start, end = matches[0]
    assert (start, end) == (0, 4)
@pytest.mark.xfail
def test_issue850():
    '''The problem here is that the variable-length pattern matches the
    succeeding token. We then don't handle the ambiguity correctly.

    Expected behaviour: a single match spanning tokens 0 to 4.'''
    lowercase = lambda string: string.lower()
    matcher = Matcher(Vocab(lex_attr_getters={LOWER: lowercase}))
    # Boolean flag that is true for every token; used as the '*' wildcard.
    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
    pattern = [
        {LOWER: "bob"},
        {'OP': '*', IS_ANY_TOKEN: True},
        {LOWER: 'frank'}
    ]
    matcher.add_pattern("FarAway", pattern)
    words = ['bob', 'and', 'and', 'frank']
    doc = Doc(matcher.vocab, words=words)
    matches = matcher(doc)
    assert len(matches) == 1
    ent_id, label, start, end = matches[0]
    assert (start, end) == (0, 4)

View File

@ -0,0 +1,12 @@
# encoding: utf8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
                                  "aaabbb@ccc.com \nThank you!"])
def test_issue859(en_tokenizer, text):
    """Check that doc.text reproduces the input exactly, without adding
    an extra space (Issue #859)."""
    tokenized = en_tokenizer(text)
    assert tokenized.text == text

View File

@ -0,0 +1,40 @@
from __future__ import unicode_literals
import io
import pytest
import dill as pickle
from ..strings import StringStore
from ..vocab import Vocab
from ..attrs import NORM
def test_pickle_string_store():
    '''Round-trip a StringStore through pickle and check that the stored
    strings keep their values and that the store keeps its length.'''
    sstore = StringStore()
    words = ['hello', 'bye']
    # Intern the strings in order and remember what the store returned.
    expected = [sstore[word] for word in words]
    unpickled = pickle.loads(pickle.dumps(sstore, protocol=-1))
    for word, value in zip(words, expected):
        assert unpickled[word] == value
    assert len(sstore) == len(unpickled)
@pytest.mark.xfail
def test_pickle_vocab():
    '''Round-trip a Vocab with a custom NORM getter through pickle and
    check that orth and norm values of its lexemes are preserved.'''
    drop_last_char = lambda string: string[:-1]
    vocab = Vocab(lex_attr_getters={int(NORM): drop_last_char})
    dog = vocab[u'dog']
    cat = vocab[u'cat']
    assert dog.norm_ == 'do'
    assert cat.norm_ == 'ca'

    unpickled = pickle.loads(pickle.dumps(vocab))
    originals = [(u'dog', dog), (u'cat', cat)]
    for word, lexeme in originals:
        assert unpickled[word].orth == lexeme.orth
    for word, lexeme in originals:
        assert unpickled[word].norm == lexeme.norm
    # Distinct words must not collapse onto the same norm after unpickling.
    dog_ = unpickled[u'dog']
    cat_ = unpickled[u'cat']
    assert dog_.norm != cat_.norm

View File

@ -163,7 +163,6 @@ cdef class Tokenizer:
start = i start = i
in_ws = not in_ws in_ws = not in_ws
i += 1 i += 1
i += 1
if start < i: if start < i:
span = string[start:] span = string[start:]
key = hash_string(span) key = hash_string(span)
@ -275,7 +274,10 @@ cdef class Tokenizer:
if cache_hit: if cache_hit:
pass pass
elif self.token_match and self.token_match(string): elif self.token_match and self.token_match(string):
tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size()) # We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else: else:
matches = self.find_infix(string) matches = self.find_infix(string)
if not matches: if not matches:

View File

@ -16,7 +16,7 @@ from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from .span cimport Span from .span cimport Span
@ -59,13 +59,13 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
cdef class Doc: cdef class Doc:
""" """
A sequence of `Token` objects. Access sentences and named entities, A sequence of `Token` objects. Access sentences and named entities,
export annotations to numpy arrays, losslessly serialize to compressed export annotations to numpy arrays, losslessly serialize to compressed
binary strings. binary strings.
Aside: Internals Aside: Internals
The `Doc` object holds an array of `TokenC` structs. The `Doc` object holds an array of `TokenC` structs.
The Python-level `Token` and `Span` objects are views of this The Python-level `Token` and `Span` objects are views of this
array, i.e. they don't own the data themselves. array, i.e. they don't own the data themselves.
Code: Construction 1 Code: Construction 1
@ -80,13 +80,13 @@ cdef class Doc:
Create a Doc object. Create a Doc object.
Aside: Implementation Aside: Implementation
This method of constructing a `Doc` object is usually only used This method of constructing a `Doc` object is usually only used
for deserialization. Standard usage is to construct the document via for deserialization. Standard usage is to construct the document via
a call to the language object. a call to the language object.
Arguments: Arguments:
vocab: vocab:
A Vocabulary object, which must match any models you want to A Vocabulary object, which must match any models you want to
use (e.g. tokenizer, parser, entity recognizer). use (e.g. tokenizer, parser, entity recognizer).
words: words:
@ -156,19 +156,19 @@ cdef class Doc:
if self.length == 0: if self.length == 0:
self.is_tagged = True self.is_tagged = True
self.is_parsed = True self.is_parsed = True
def __getitem__(self, object i): def __getitem__(self, object i):
''' '''
doc[i] doc[i]
Get the Token object at position i, where i is an integer. Get the Token object at position i, where i is an integer.
Negative indexing is supported, and follows the usual Python Negative indexing is supported, and follows the usual Python
semantics, i.e. doc[-2] is doc[len(doc) - 2]. semantics, i.e. doc[-2] is doc[len(doc) - 2].
doc[start : end]] doc[start : end]]
Get a `Span` object, starting at position `start` Get a `Span` object, starting at position `start`
and ending at position `end`, where `start` and and ending at position `end`, where `start` and
`end` are token indices. For instance, `end` are token indices. For instance,
`doc[2:5]` produces a span consisting of `doc[2:5]` produces a span consisting of
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`) tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
are not supported, as `Span` objects must be contiguous (cannot have gaps). are not supported, as `Span` objects must be contiguous (cannot have gaps).
You can use negative indices and open-ended ranges, which have their You can use negative indices and open-ended ranges, which have their
normal Python semantics. normal Python semantics.
@ -188,11 +188,11 @@ cdef class Doc:
def __iter__(self): def __iter__(self):
''' '''
for token in doc for token in doc
Iterate over `Token` objects, from which the annotations can Iterate over `Token` objects, from which the annotations can
be easily accessed. This is the main way of accessing Token be easily accessed. This is the main way of accessing Token
objects, which are the main way annotations are accessed from objects, which are the main way annotations are accessed from
Python. If faster-than-Python speeds are required, you can Python. If faster-than-Python speeds are required, you can
instead access the annotations as a numpy array, or access the instead access the annotations as a numpy array, or access the
underlying C data directly from Cython. underlying C data directly from Cython.
''' '''
cdef int i cdef int i
@ -251,13 +251,13 @@ cdef class Doc:
def __get__(self): def __get__(self):
if 'has_vector' in self.user_hooks: if 'has_vector' in self.user_hooks:
return self.user_hooks['has_vector'](self) return self.user_hooks['has_vector'](self)
return any(token.has_vector for token in self) return any(token.has_vector for token in self)
property vector: property vector:
''' '''
A real-valued meaning representation. Defaults to an average of the token vectors. A real-valued meaning representation. Defaults to an average of the token vectors.
Type: numpy.ndarray[ndim=1, dtype='float32'] Type: numpy.ndarray[ndim=1, dtype='float32']
''' '''
def __get__(self): def __get__(self):
@ -285,14 +285,14 @@ cdef class Doc:
norm += value * value norm += value * value
self._vector_norm = sqrt(norm) if norm != 0 else 0 self._vector_norm = sqrt(norm) if norm != 0 else 0
return self._vector_norm return self._vector_norm
def __set__(self, value): def __set__(self, value):
self._vector_norm = value self._vector_norm = value
@property @property
def string(self): def string(self):
return self.text return self.text
property text: property text:
'''A unicode representation of the document text.''' '''A unicode representation of the document text.'''
def __get__(self): def __get__(self):
@ -306,7 +306,7 @@ cdef class Doc:
property ents: property ents:
''' '''
Yields named-entity `Span` objects, if the entity recognizer Yields named-entity `Span` objects, if the entity recognizer
has been applied to the document. Iterate over the span to get has been applied to the document. Iterate over the span to get
individual Token objects, or access the label: individual Token objects, or access the label:
Example: Example:
@ -352,7 +352,7 @@ cdef class Doc:
cdef int i cdef int i
for i in range(self.length): for i in range(self.length):
self.c[i].ent_type = 0 self.c[i].ent_type = 0
# At this point we don't know whether the NER has run over the # At this point we don't know whether the NER has run over the
# Doc. If the ent_iob is missing, leave it missing. # Doc. If the ent_iob is missing, leave it missing.
if self.c[i].ent_iob != 0: if self.c[i].ent_iob != 0:
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents. self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
@ -384,9 +384,9 @@ cdef class Doc:
property noun_chunks: property noun_chunks:
''' '''
Yields base noun-phrase #[code Span] objects, if the document Yields base noun-phrase #[code Span] objects, if the document
has been syntactically parsed. A base noun phrase, or has been syntactically parsed. A base noun phrase, or
'NP chunk', is a noun phrase that does not permit other NPs to 'NP chunk', is a noun phrase that does not permit other NPs to
be nested within it so no NP-level coordination, no prepositional be nested within it so no NP-level coordination, no prepositional
phrases, and no relative clauses. For example: phrases, and no relative clauses. For example:
''' '''
def __get__(self): def __get__(self):
@ -422,7 +422,7 @@ cdef class Doc:
def __get__(self): def __get__(self):
if 'sents' in self.user_hooks: if 'sents' in self.user_hooks:
return self.user_hooks['sents'](self) return self.user_hooks['sents'](self)
if not self.is_parsed: if not self.is_parsed:
raise ValueError( raise ValueError(
"sentence boundary detection requires the dependency parse, which " "sentence boundary detection requires the dependency parse, which "
@ -465,8 +465,8 @@ cdef class Doc:
@cython.boundscheck(False) @cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
""" """
Given a list of M attribute IDs, export the tokens to a numpy Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape (N, M), where `N` is the length `ndarray` of shape (N, M), where `N` is the length
of the document. The values will be 32-bit integers. of the document. The values will be 32-bit integers.
Example: Example:
@ -474,7 +474,7 @@ cdef class Doc:
doc = nlp(text) doc = nlp(text)
# All strings mapped to integers, for easy export to numpy # All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA]) np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
Arguments: Arguments:
attr_ids (list[int]): A list of attribute ID ints. attr_ids (list[int]): A list of attribute ID ints.
@ -520,7 +520,7 @@ cdef class Doc:
cdef int i cdef int i
cdef attr_t attr cdef attr_t attr
cdef size_t count cdef size_t count
if counts is None: if counts is None:
counts = PreshCounter() counts = PreshCounter()
output_dict = True output_dict = True
@ -570,7 +570,7 @@ cdef class Doc:
cdef TokenC* tokens = self.c cdef TokenC* tokens = self.c
cdef int length = len(array) cdef int length = len(array)
cdef attr_t[:] values cdef attr_t[:] values
for col, attr_id in enumerate(attrs): for col, attr_id in enumerate(attrs):
values = array[:, col] values = array[:, col]
if attr_id == HEAD: if attr_id == HEAD:
for i in range(length): for i in range(length):
@ -612,11 +612,11 @@ cdef class Doc:
'''Deserialize, loading from bytes.''' '''Deserialize, loading from bytes.'''
self.vocab.serializer.unpack_into(data[4:], self) self.vocab.serializer.unpack_into(data[4:], self)
return self return self
@staticmethod @staticmethod
def read_bytes(file_): def read_bytes(file_):
''' '''
A static method, used to read serialized #[code Doc] objects from A static method, used to read serialized #[code Doc] objects from
a file. For example: a file. For example:
Example: Example:
@ -673,7 +673,7 @@ cdef class Doc:
"Expected either 3 arguments (deprecated), or 0 (use keyword arguments). " "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
"Arguments supplied:\n%s\n" "Arguments supplied:\n%s\n"
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
cdef int start = token_by_start(self.c, self.length, start_idx) cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1: if start == -1:
return None return None
@ -784,7 +784,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
if child.l_edge < head.l_edge: if child.l_edge < head.l_edge:
head.l_edge = child.l_edge head.l_edge = child.l_edge
head.l_kids += 1 head.l_kids += 1
# Set right edges --- same as above, but iterate in reverse # Set right edges --- same as above, but iterate in reverse
for i in range(length-1, -1, -1): for i in range(length-1, -1, -1):
child = &tokens[i] child = &tokens[i]
@ -798,4 +798,4 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
for i in range(length): for i in range(length):
if tokens[i].head == 0 and tokens[i].dep != 0: if tokens[i].head == 0 and tokens[i].dep != 0:
tokens[tokens[i].l_edge].sent_start = True tokens[tokens[i].l_edge].sent_start = True

View File

@ -20,7 +20,7 @@ from .. import parts_of_speech
from ..attrs cimport LEMMA from ..attrs cimport LEMMA
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP from ..attrs cimport POS, LEMMA, TAG, DEP
from ..parts_of_speech cimport CONJ, PUNCT from ..parts_of_speech cimport CCONJ, PUNCT
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET from ..attrs cimport IS_BRACKET
@ -84,7 +84,7 @@ cdef class Token:
cpdef bint check_flag(self, attr_id_t flag_id) except -1: cpdef bint check_flag(self, attr_id_t flag_id) except -1:
'''Check the value of a boolean flag. '''Check the value of a boolean flag.
Arguments: Arguments:
flag_id (int): The ID of the flag attribute. flag_id (int): The ID of the flag attribute.
Returns: Returns:
@ -225,7 +225,7 @@ cdef class Token:
property vector: property vector:
''' '''
A real-valued meaning representation. A real-valued meaning representation.
Type: numpy.ndarray[ndim=1, dtype='float32'] Type: numpy.ndarray[ndim=1, dtype='float32']
''' '''
def __get__(self): def __get__(self):
@ -343,7 +343,7 @@ cdef class Token:
''' '''
def __get__(self): def __get__(self):
cdef const TokenC* head_ptr = self.c cdef const TokenC* head_ptr = self.c
# guard against infinite loop, no token can have # guard against infinite loop, no token can have
# more ancestors than tokens in the tree # more ancestors than tokens in the tree
cdef int i = 0 cdef int i = 0
while head_ptr.head != 0 and i < self.doc.length: while head_ptr.head != 0 and i < self.doc.length:
@ -370,7 +370,7 @@ cdef class Token:
property head: property head:
'''The syntactic parent, or "governor", of this token. '''The syntactic parent, or "governor", of this token.
Returns: Token Returns: Token
''' '''
def __get__(self): def __get__(self):
@ -390,7 +390,7 @@ cdef class Token:
# is the new head a descendant of the old head # is the new head a descendant of the old head
cdef bint is_desc = old_head.is_ancestor_of(new_head) cdef bint is_desc = old_head.is_ancestor_of(new_head)
cdef int new_edge cdef int new_edge
cdef Token anc, child cdef Token anc, child
@ -420,7 +420,7 @@ cdef class Token:
if anc.c.l_edge <= new_edge: if anc.c.l_edge <= new_edge:
break break
anc.c.l_edge = new_edge anc.c.l_edge = new_edge
elif self.c.head < 0: # right dependent elif self.c.head < 0: # right dependent
old_head.c.r_kids -= 1 old_head.c.r_kids -= 1
# do the same thing as for l_edge # do the same thing as for l_edge
@ -435,7 +435,7 @@ cdef class Token:
if child.c.r_edge > new_edge: if child.c.r_edge > new_edge:
new_edge = child.c.r_edge new_edge = child.c.r_edge
old_head.c.r_edge = new_edge old_head.c.r_edge = new_edge
for anc in old_head.ancestors: for anc in old_head.ancestors:
if anc.c.r_edge >= new_edge: if anc.c.r_edge >= new_edge:
break break
@ -598,19 +598,19 @@ cdef class Token:
property is_punct: property is_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
property is_space: property is_space:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
property is_bracket: property is_bracket:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
property is_quote: property is_quote:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
property is_left_punct: property is_left_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
property is_right_punct: property is_right_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
property like_url: property like_url:

View File

@ -9,11 +9,16 @@ import bz2
import ujson as json import ujson as json
import re import re
try:
import cPickle as pickle
except ImportError:
import pickle
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .strings cimport hash_string from .strings cimport hash_string
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .cfile cimport CFile from .cfile cimport CFile, StringCFile
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .attrs import intify_attrs from .attrs import intify_attrs
from .tokens.token cimport Token from .tokens.token cimport Token
@ -346,17 +351,18 @@ cdef class Vocab:
Token.set_struct_attr(token, attr_id, value) Token.set_struct_attr(token, attr_id, value)
return tokens return tokens
def dump(self, loc): def dump(self, loc=None):
"""Save the lexemes binary data to the given location. """Save the lexemes binary data to the given location, or
return a byte-string with the data if loc is None.
Arguments: Arguments:
loc (Path): The path to save to. loc (Path or None): The path to save to, or None.
""" """
if hasattr(loc, 'as_posix'): cdef CFile fp
loc = loc.as_posix() if loc is None:
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc fp = StringCFile('wb')
else:
cdef CFile fp = CFile(bytes_loc, 'wb') fp = CFile(loc, 'wb')
cdef size_t st cdef size_t st
cdef size_t addr cdef size_t addr
cdef hash_t key cdef hash_t key
@ -378,6 +384,8 @@ cdef class Vocab:
fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1) fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1) fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
fp.close() fp.close()
if loc is None:
return fp.string_data()
def load_lexemes(self, loc): def load_lexemes(self, loc):
'''Load the binary vocabulary data from the given location. '''Load the binary vocabulary data from the given location.
@ -427,6 +435,60 @@ cdef class Vocab:
i += 1 i += 1
fp.close() fp.close()
def _deserialize_lexemes(self, CFile fp):
    '''Load the binary vocabulary data from the given CFile.

    Reads fixed-size lexeme records from `fp` for as long as at least one
    whole record remains, registers every lexeme in the hash and orth
    lookup tables, and increments `self.length` once per record loaded.
    `fp` is always closed before this method returns, including when
    loading a record raises.

    Arguments:
        fp (CFile): Source of the serialized lexemes. `fp.size` is treated
            as the number of bytes available to read.
    '''
    cdef LexemeC* lexeme
    cdef hash_t key
    cdef unicode py_str
    cdef attr_t orth
    assert sizeof(orth) == sizeof(lexeme.orth)
    cdef int todo = fp.size
    # Size in bytes of one serialized record: the sum of the sizes of the
    # fields read below, in the same order.
    cdef int lex_size = sizeof(lexeme.flags)
    lex_size += sizeof(lexeme.id)
    lex_size += sizeof(lexeme.length)
    lex_size += sizeof(lexeme.orth)
    lex_size += sizeof(lexeme.lower)
    lex_size += sizeof(lexeme.norm)
    lex_size += sizeof(lexeme.shape)
    lex_size += sizeof(lexeme.prefix)
    lex_size += sizeof(lexeme.suffix)
    lex_size += sizeof(lexeme.cluster)
    lex_size += sizeof(lexeme.prob)
    lex_size += sizeof(lexeme.sentiment)
    lex_size += sizeof(lexeme.l2_norm)
    lex_size += sizeof(lexeme.lang)
    try:
        # Stop as soon as fewer bytes than one whole record remain.
        while todo >= lex_size:
            todo -= lex_size
            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
            # Copy data from the file into the lexeme
            fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
            fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))
            fp.read_into(&lexeme.length, 1, sizeof(lexeme.length))
            fp.read_into(&lexeme.orth, 1, sizeof(lexeme.orth))
            fp.read_into(&lexeme.lower, 1, sizeof(lexeme.lower))
            fp.read_into(&lexeme.norm, 1, sizeof(lexeme.norm))
            fp.read_into(&lexeme.shape, 1, sizeof(lexeme.shape))
            fp.read_into(&lexeme.prefix, 1, sizeof(lexeme.prefix))
            fp.read_into(&lexeme.suffix, 1, sizeof(lexeme.suffix))
            fp.read_into(&lexeme.cluster, 1, sizeof(lexeme.cluster))
            fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
            fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
            fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
            fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))
            # Vectors are not part of the record; start from the empty vector.
            lexeme.vector = EMPTY_VEC
            # The lexeme is indexed both by the hash of its string and by
            # its orth ID, so the string must already be in self.strings.
            py_str = self.strings[lexeme.orth]
            key = hash_string(py_str)
            self._by_hash.set(key, lexeme)
            self._by_orth.set(lexeme.orth, lexeme)
            self.length += 1
    finally:
        # Release the file even if a record could not be loaded.
        fp.close()
def dump_vectors(self, out_loc): def dump_vectors(self, out_loc):
'''Save the word vectors to a binary file. '''Save the word vectors to a binary file.
@ -553,6 +615,42 @@ cdef class Vocab:
return vec_len return vec_len
def pickle_vocab(vocab):
    """Reduce a Vocab to a (constructor, arguments) pair for pickling.

    The lexemes are captured as the value returned by `vocab.dump()` called
    without a location; the remaining state is passed along as-is.

    Arguments:
        vocab (Vocab): The vocabulary to reduce.
    Returns:
        tuple: `unpickle_vocab` and the argument tuple to call it with.
    """
    strings = vocab.strings
    morphology = vocab.morphology
    n_lexemes = vocab.length
    serializer = vocab._serializer
    data_dir = vocab.data_dir
    attr_getters = vocab.lex_attr_getters
    lexemes_bytes = vocab.dump()
    n_vector_dims = vocab.vectors_length
    # Order must match the parameter order of unpickle_vocab.
    state = (
        strings,
        morphology,
        serializer,
        data_dir,
        attr_getters,
        lexemes_bytes,
        n_lexemes,
        n_vector_dims,
    )
    return (unpickle_vocab, state)
def unpickle_vocab(sstore, morphology, serializer, data_dir,
        lex_attr_getters, bytes lexemes_data, int length, int vectors_length):
    """Rebuild a Vocab from the arguments produced by pickle_vocab.

    Arguments:
        sstore: The string store to attach as `vocab.strings`.
        morphology: The object to attach as `vocab.morphology`.
        serializer: The object to attach as `vocab._serializer`.
        data_dir: The value to attach as `vocab.data_dir`.
        lex_attr_getters: The value to attach as `vocab.lex_attr_getters`.
        lexemes_data (bytes): Serialized lexemes to load into the vocab.
        length (int): Value to restore as `vocab.length`.
        vectors_length (int): Value to restore as `vocab.vectors_length`.
    Returns:
        Vocab: The reconstructed vocabulary.
    """
    cdef Vocab vocab = Vocab()
    # The strings must be in place before the lexemes are loaded, because
    # loading looks each lexeme's string up through vocab.strings.
    vocab.strings = sstore
    vocab.morphology = morphology
    vocab._serializer = serializer
    vocab.data_dir = data_dir
    vocab.lex_attr_getters = lex_attr_getters
    cdef CFile fp = StringCFile('r', data=lexemes_data)
    vocab._deserialize_lexemes(fp)
    # _deserialize_lexemes increments vocab.length for every lexeme it
    # loads, so the pickled counters are restored only after loading.
    vocab.length = length
    vocab.vectors_length = vectors_length
    return vocab
# Register pickle_vocab as the reduction function for Vocab, so that pickling
# a Vocab goes through pickle_vocab / unpickle_vocab.
# NOTE(review): `copy_reg` is the Python 2 module name (`copyreg` on Python 3);
# confirm the file's imports provide it under this name on both versions.
copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab)
def write_binary_vectors(in_loc, out_loc): def write_binary_vectors(in_loc, out_loc):
cdef CFile out_file = CFile(out_loc, 'wb') cdef CFile out_file = CFile(out_loc, 'wb')
cdef Address mem cdef Address mem