Merge branch 'develop'

Matthew Honnibal 2017-03-10 02:49:39 -06:00
commit ea53647362
29 changed files with 535 additions and 138 deletions

View File

@@ -66,8 +66,8 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
 def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
           n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
-    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
-    format_str = '{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
+    print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
+    format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
     with Language.train(model_dir, train_data,
                         tagger_cfg, parser_cfg, entity_cfg) as trainer:
         loss = 0
@@ -76,11 +76,13 @@ def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, ent
             for doc, gold in epoch:
                 trainer.update(doc, gold)
            dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
-            print(format_str.format(itn, loss, **dev_scores.scores))
+            print(format_str.format(itn, trainer.nlp.parser.model.nr_weight,
+                trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores))


 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
              beam_width=None, cand_preproc=None):
+    print("Load parser", model_dir)
     nlp = Language(path=model_dir)
     if nlp.lang == 'de':
         nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
@@ -145,9 +147,11 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
     verbose=("Verbose error reporting", "flag", "v", bool),
     debug=("Debug mode", "flag", "d", bool),
     pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
+    L1=("L1 regularization penalty", "option", "L", float),
 )
 def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
-         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
+         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
+         L1=1e-6):
     parser_cfg = dict(locals())
     tagger_cfg = dict(locals())
     entity_cfg = dict(locals())
@@ -160,6 +164,8 @@ def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc=
     if not eval_only:
         gold_train = list(read_json_file(train_loc))
         gold_dev = list(read_json_file(dev_loc))
+        if n_sents > 0:
+            gold_train = gold_train[:n_sents]
         train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
               n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
               n_iter=n_iter)

View File

@@ -10,3 +10,4 @@ six
 ujson>=1.35
 cloudpickle
 sputnik>=0.9.2,<0.10.0
+dill>=0.2,<0.3

View File

@@ -241,7 +241,8 @@ def setup_package():
             'cloudpickle',
             'pathlib',
             'sputnik>=0.9.2,<0.10.0',
-            'ujson>=1.35'],
+            'ujson>=1.35',
+            'dill>=0.2,<0.3'],
         classifiers=[
             'Development Status :: 5 - Production/Stable',
             'Environment :: Console',

View File

@@ -125,7 +125,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
             'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
             'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
             'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
-            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case']
+            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
+            'Polarity', # U20
+        ]
         for key in morph_keys:
             if key in stringy_attrs:
                 stringy_attrs.pop(key)

View File

@@ -4,6 +4,20 @@ from cymem.cymem cimport Pool
 cdef class CFile:
     cdef FILE* fp
     cdef bint is_open
+    cdef Pool mem
+    cdef int size # For compatibility with subclass
+    cdef int _capacity # For compatibility with subclass
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
+
+
+cdef class StringCFile(CFile):
+    cdef unsigned char* data
+
     cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

View File

@@ -1,4 +1,5 @@
 from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from libc.string cimport memcpy


 cdef class CFile:
@@ -9,6 +10,7 @@ cdef class CFile:
             mode_str = mode
         if hasattr(loc, 'as_posix'):
             loc = loc.as_posix()
+        self.mem = Pool()
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
         self.fp = fopen(<char*>bytes_loc, mode_str)
         if self.fp == NULL:
@@ -45,3 +47,42 @@
         cdef bytes py_bytes = value.encode('utf8')
         cdef char* chars = <char*>py_bytes
         self.write(sizeof(char), len(py_bytes), chars)
+
+
+cdef class StringCFile:
+    def __init__(self, mode, bytes data=b'', on_open_error=None):
+        self.mem = Pool()
+        self.is_open = 'w' in mode
+        self._capacity = max(len(data), 8)
+        self.size = len(data)
+        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
+        for i in range(len(data)):
+            self.data[i] = data[i]
+
+    def close(self):
+        self.is_open = False
+
+    def string_data(self):
+        return (self.data-self.size)[:self.size]
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
+        memcpy(dest, self.data, elem_size * number)
+        self.data += elem_size * number
+
+    cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
+        write_size = number * elem_size
+        if (self.size + write_size) >= self._capacity:
+            self._capacity = (self.size + write_size) * 2
+            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
+        memcpy(&self.data[self.size], src, elem_size * number)
+        self.size += write_size
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
+        cdef void* dest = mem.alloc(number, elem_size)
+        self.read_into(dest, number, elem_size)
+        return dest
+
+    def write_unicode(self, unicode value):
+        cdef bytes py_bytes = value.encode('utf8')
+        cdef char* chars = <char*>py_bytes
+        self.write(sizeof(char), len(py_bytes), chars)
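
A note on the new StringCFile class: it gives serialization code an in-memory target with the same read_into/write_from interface as CFile. Writes append to a growable byte buffer, reads consume bytes from the front, and string_data() returns everything written so far. A rough pure-Python analogue, purely illustrative (the real class is a Cython cdef class working on raw pointers via cymem):

    class StringBuffer(object):
        """Sketch of StringCFile's behaviour in plain Python."""
        def __init__(self, data=b''):
            self._data = bytearray(data)   # growable write buffer
            self._read_pos = 0             # read cursor

        def write_from(self, payload):
            # Append raw bytes; bytearray handles the capacity growth that
            # StringCFile does manually with realloc.
            self._data.extend(payload)

        def read_into(self, n_bytes):
            # Hand back the next n_bytes and advance the read cursor.
            chunk = bytes(self._data[self._read_pos:self._read_pos + n_bytes])
            self._read_pos += n_bytes
            return chunk

        def string_data(self):
            # The full byte-string written so far.
            return bytes(self._data)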

View File

@ -41,7 +41,7 @@ TAG_MAP = {
"PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"}, "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
"PTKA": {POS: PART}, "PTKA": {POS: PART},
"PTKANT": {POS: PART, "PartType": "res"}, "PTKANT": {POS: PART, "PartType": "res"},
"PTKNEG": {POS: PART, "Negative": "yes"}, "PTKNEG": {POS: PART, "Polarity": "Neg"},
"PTKVZ": {POS: PART, "PartType": "vbp"}, "PTKVZ": {POS: PART, "PartType": "vbp"},
"PTKZU": {POS: PART, "PartType": "inf"}, "PTKZU": {POS: PART, "PartType": "inf"},
"PWAT": {POS: DET, "PronType": "int"}, "PWAT": {POS: DET, "PronType": "int"},

View File

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals, print_function

 from os import path
+from pathlib import Path

 from ..util import match_best_version
 from ..util import get_data_path
@@ -13,6 +14,11 @@ from ..attrs import LANG

 from .language_data import *

+try:
+    basestring
+except NameError:
+    basestring = str
+

 class English(Language):
     lang = 'en'
@@ -43,14 +49,15 @@ def _fix_deprecated_glove_vectors_loading(overrides):
         data_path = get_data_path()
     else:
         path = overrides['path']
+        if isinstance(path, basestring):
+            path = Path(path)
         data_path = path.parent
     vec_path = None
     if 'add_vectors' not in overrides:
         if 'vectors' in overrides:
             vec_path = match_best_version(overrides['vectors'], None, data_path)
             if vec_path is None:
-                raise IOError(
-                    'Could not load data pack %s from %s' % (overrides['vectors'], data_path))
+                return overrides
         else:
             vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
     if vec_path is not None:

View File

@@ -16,7 +16,7 @@ TAG_MAP = {
     "$": {POS: SYM, "Other": {"SymType": "currency"}},
     "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
     "AFX": {POS: ADJ, "Hyph": "yes"},
-    "CC": {POS: CONJ, "ConjType": "coor"},
+    "CC": {POS: CCONJ, "ConjType": "coor"},
     "CD": {POS: NUM, "NumType": "card"},
     "DT": {POS: DET},
     "EX": {POS: ADV, "AdvType": "ex"},

View File

@@ -5,7 +5,7 @@ import pathlib
 from contextlib import contextmanager
 import shutil

-import ujson as json
+import ujson

 try:
@@ -13,6 +13,10 @@ try:
 except NameError:
     basestring = str

+try:
+    unicode
+except NameError:
+    unicode = str

 from .tokenizer import Tokenizer
 from .vocab import Vocab
@@ -226,12 +230,21 @@ class Language(object):
             parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
             entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)

-        with (dep_model_dir / 'config.json').open('w') as file_:
-            json.dump(parser_cfg, file_)
-        with (ner_model_dir / 'config.json').open('w') as file_:
-            json.dump(entity_cfg, file_)
-        with (pos_model_dir / 'config.json').open('w') as file_:
-            json.dump(tagger_cfg, file_)
+        with (dep_model_dir / 'config.json').open('wb') as file_:
+            data = ujson.dumps(parser_cfg)
+            if isinstance(data, unicode):
+                data = data.encode('utf8')
+            file_.write(data)
+        with (ner_model_dir / 'config.json').open('wb') as file_:
+            data = ujson.dumps(entity_cfg)
+            if isinstance(data, unicode):
+                data = data.encode('utf8')
+            file_.write(data)
+        with (pos_model_dir / 'config.json').open('wb') as file_:
+            data = ujson.dumps(tagger_cfg)
+            if isinstance(data, unicode):
+                data = data.encode('utf8')
+            file_.write(data)

         self = cls(
             path=path,
@@ -252,7 +265,7 @@ class Language(object):
             self.entity = self.Defaults.create_entity(self)
             self.pipeline = self.Defaults.create_pipeline(self)
             yield Trainer(self, gold_tuples)
-            self.end_training()
+            self.end_training(path=path)

     def __init__(self, **overrides):
         if 'data_dir' in overrides and 'path' not in overrides:
@@ -391,12 +404,14 @@ class Language(object):
         else:
            entity_iob_freqs = []
            entity_type_freqs = []
-        with (path / 'vocab' / 'serializer.json').open('w') as file_:
-            file_.write(
-                json.dumps([
+        with (path / 'vocab' / 'serializer.json').open('wb') as file_:
+            data = ujson.dumps([
                 (TAG, tagger_freqs),
                 (DEP, dep_freqs),
                 (ENT_IOB, entity_iob_freqs),
                 (ENT_TYPE, entity_type_freqs),
                 (HEAD, head_freqs)
-                ]))
+            ])
+            if isinstance(data, unicode):
+                data = data.encode('utf8')
+            file_.write(data)
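
The config and serializer writes above all repeat one pattern: serialize with ujson, encode to UTF-8 when the result comes back as text, and write the file in binary mode so Python 2 and 3 behave the same. A small helper capturing the same idea (hypothetical, not part of this commit):

    import ujson

    def write_json_bytes(path, obj):
        # ujson.dumps may return text; encode it before the binary write.
        data = ujson.dumps(obj)
        if not isinstance(data, bytes):
            data = data.encode('utf8')
        with open(path, 'wb') as file_:
            file_.write(data)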

View File

@@ -19,6 +19,7 @@ TAG_MAP = {
     "AUX": {POS: AUX},
     "X": {POS: X},
     "CONJ": {POS: CONJ},
+    "CCONJ": {POS: CCONJ}, # U20
     "ADJ": {POS: ADJ},
     "VERB": {POS: VERB},
     "PART": {POS: PART}

View File

@@ -80,6 +80,7 @@ cpdef enum univ_morph_t:
     Definite_two
     Definite_def
     Definite_red
+    Definite_cons # U20
     Definite_ind
     Degree_cmp
     Degree_comp
@@ -103,6 +104,8 @@ cpdef enum univ_morph_t:
     Negative_neg
     Negative_pos
     Negative_yes
+    Polarity_neg # U20
+    Polarity_pos # U20
     Number_com
     Number_dual
     Number_none
@@ -151,6 +154,7 @@ cpdef enum univ_morph_t:
     VerbForm_partPres
     VerbForm_sup
     VerbForm_trans
+    VerbForm_conv # U20
     VerbForm_gdv # la
     Voice_act
     Voice_cau

View File

@@ -192,6 +192,7 @@ IDS = {
     "Definite_two": Definite_two,
     "Definite_def": Definite_def,
     "Definite_red": Definite_red,
+    "Definite_cons": Definite_cons, # U20
     "Definite_ind": Definite_ind,
     "Degree_cmp": Degree_cmp,
     "Degree_comp": Degree_comp,
@@ -215,6 +216,8 @@ IDS = {
     "Negative_neg": Negative_neg,
     "Negative_pos": Negative_pos,
     "Negative_yes": Negative_yes,
+    "Polarity_neg": Polarity_neg, # U20
+    "Polarity_pos": Polarity_pos, # U20
     "Number_com": Number_com,
     "Number_dual": Number_dual,
     "Number_none": Number_none,
@@ -263,6 +266,7 @@ IDS = {
     "VerbForm_partPres": VerbForm_partPres,
     "VerbForm_sup": VerbForm_sup,
     "VerbForm_trans": VerbForm_trans,
+    "VerbForm_conv": VerbForm_conv, # U20
     "VerbForm_gdv ": VerbForm_gdv, # la,
     "Voice_act": Voice_act,
     "Voice_cau": Voice_cau,

View File

@@ -7,6 +7,7 @@ cpdef enum univ_pos_t:
     ADV
     AUX
     CONJ
+    CCONJ # U20
     DET
     INTJ
     NOUN

View File

@@ -7,7 +7,8 @@ IDS = {
     "ADP": ADP,
     "ADV": ADV,
     "AUX": AUX,
-    "CONJ": CONJ,
+    "CONJ": CONJ, # U20
+    "CCONJ": CCONJ,
     "DET": DET,
     "INTJ": INTJ,
     "NOUN": NOUN,

View File

@@ -3,7 +3,7 @@ from __future__ import unicode_literals, absolute_import
 cimport cython
 from libc.string cimport memcpy
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint64_t, uint32_t

 from murmurhash.mrmr cimport hash64, hash32
@@ -12,22 +12,19 @@ from preshed.maps cimport map_iter, key_t
 from .typedefs cimport hash_t
-from libc.stdint cimport uint32_t

-try:
-    import ujson as json
-except ImportError:
-    import json
+import ujson


 cpdef hash_t hash_string(unicode string) except 0:
     chars = string.encode('utf8')
-    return _hash_utf8(chars, len(chars))
+    return hash_utf8(chars, len(chars))


-cdef hash_t _hash_utf8(char* utf8_string, int length):
+cdef hash_t hash_utf8(char* utf8_string, int length) nogil:
     return hash64(utf8_string, length, 1)


-cdef uint32_t _hash32_utf8(char* utf8_string, int length):
+cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
     return hash32(utf8_string, length, 1)
@@ -48,11 +45,11 @@ cdef unicode _decode(const Utf8Str* string):
     return string.p[i:length + i].decode('utf8')


-cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *:
+cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
     cdef int n_length_bytes
     cdef int i
     cdef Utf8Str string
-    assert length != 0
+    cdef uint32_t ulength = length
     if length < sizeof(string.s):
         string.s[0] = <unsigned char>length
         memcpy(&string.s[1], chars, length)
@@ -98,6 +95,14 @@ cdef class StringStore:
         def __get__(self):
             return self.size -1

+    def __reduce__(self):
+        # TODO: OOV words, for the is_frozen stuff?
+        if self.is_frozen:
+            raise NotImplementedError(
+                "Currently missing support for pickling StringStore when "
+                "is_frozen=True")
+        return (StringStore, (list(self),))
+
     def __len__(self):
         """The number of strings in the store.
@@ -149,7 +154,7 @@
                 # pretty bad.
                 # We could also get unlucky here, and hash into a value that
                 # collides with the 'real' strings.
-                return _hash32_utf8(byte_string, len(byte_string))
+                return hash32_utf8(byte_string, len(byte_string))
             else:
                 return utf8str - self.c
@@ -200,7 +205,7 @@
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
-        cdef hash_t key = _hash_utf8(utf8_string, length)
+        cdef hash_t key = hash_utf8(utf8_string, length)
         cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
         if value is not NULL:
             return value
@@ -209,7 +214,7 @@
             return value
         if self.is_frozen:
             # OOV store uses 32 bit hashes. Pretty ugly :(
-            key32 = _hash32_utf8(utf8_string, length)
+            key32 = hash32_utf8(utf8_string, length)
             # Important: Make the OOV store own the memory. That way it's trivial
             # to flush them all.
             value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
@@ -232,7 +237,7 @@
         Returns:
             None
         """
-        string_data = json.dumps(list(self))
+        string_data = ujson.dumps(list(self))
         if not isinstance(string_data, unicode):
             string_data = string_data.decode('utf8')
         # TODO: OOV?
@@ -246,7 +251,7 @@
         Returns:
             None
         """
-        strings = json.load(file_)
+        strings = ujson.load(file_)
         if strings == ['']:
             return None
         cdef unicode string
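
With __reduce__ in place, a StringStore pickles by re-interning its contents in order, so the integer IDs survive the round trip (unless the store is frozen, which raises NotImplementedError). Minimal usage, mirroring the test_pickle_string_store test added later in this commit:

    import dill as pickle          # dill is added to requirements in this commit
    from spacy.strings import StringStore

    sstore = StringStore()
    hello_id = sstore['hello']
    restored = pickle.loads(pickle.dumps(sstore))
    # Strings are re-interned in the same order, so the ID is preserved.
    assert restored['hello'] == hello_id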

View File

@@ -90,6 +90,7 @@ cpdef enum symbol_t:
     ADV
     AUX
     CONJ
+    CCONJ # U20
     DET
     INTJ
     NOUN
@@ -107,11 +108,14 @@ cpdef enum symbol_t:
     Animacy_anim
     Animacy_inam
+    Animacy_hum # U20
     Aspect_freq
     Aspect_imp
     Aspect_mod
     Aspect_none
     Aspect_perf
+    Aspect_iter # U20
+    Aspect_hab # U20
     Case_abe
     Case_abl
     Case_abs
@@ -120,10 +124,12 @@ cpdef enum symbol_t:
     Case_all
     Case_cau
     Case_com
+    Case_cmp # U20
     Case_dat
     Case_del
     Case_dis
     Case_ela
+    Case_equ # U20
     Case_ess
     Case_gen
     Case_ill
@@ -142,7 +148,9 @@ cpdef enum symbol_t:
     Definite_two
     Definite_def
     Definite_red
+    Definite_cons # U20
     Definite_ind
+    Definite_spec # U20
     Degree_cmp
     Degree_comp
     Degree_none
@@ -151,6 +159,8 @@ cpdef enum symbol_t:
     Degree_abs
     Degree_com
     Degree_dim # du
+    Degree_equ # U20
+    Evident_nfh # U20
     Gender_com
     Gender_fem
     Gender_masc
@@ -162,16 +172,21 @@ cpdef enum symbol_t:
     Mood_pot
     Mood_sub
     Mood_opt
+    Mood_prp # U20
+    Mood_adm # U20
     Negative_neg
     Negative_pos
     Negative_yes
+    Polarity_neg # U20
+    Polarity_pos # U20
     Number_com
     Number_dual
     Number_none
     Number_plur
     Number_sing
     Number_ptan # bg
-    Number_count # bg
+    Number_count # bg, U20
+    Number_tri # U20
     NumType_card
     NumType_dist
     NumType_frac
@@ -197,7 +212,8 @@ cpdef enum symbol_t:
     PronType_rel
     PronType_tot
     PronType_clit
-    PronType_exc # es, ca, it, fa
+    PronType_exc # es, ca, it, fa, U20
+    PronType_emp # U20
     Reflex_yes
     Tense_fut
     Tense_imp
@@ -213,12 +229,17 @@ cpdef enum symbol_t:
     VerbForm_partPres
     VerbForm_sup
     VerbForm_trans
+    VerbForm_conv # U20
     VerbForm_gdv # la
+    VerbForm_vnoun # U20
     Voice_act
     Voice_cau
     Voice_pass
-    Voice_mid # gkc
+    Voice_mid # gkc, U20
     Voice_int # hb
+    Voice_antip # U20
+    Voice_dir # U20
+    Voice_inv # U20
     Abbr_yes # cz, fi, sl, U
     AdpType_prep # cz, U
     AdpType_post # U
@@ -284,6 +305,10 @@ cpdef enum symbol_t:
     Number_psee_plur # U
     Number_psor_sing # cz, fi, sl, U
     Number_psor_plur # cz, fi, sl, U
+    Number_pauc # U20
+    Number_grpa # U20
+    Number_grpl # U20
+    Number_inv # U20
     NumForm_digit # cz, sl, U
     NumForm_roman # cz, sl, U
     NumForm_word # cz, sl, U
@@ -311,6 +336,8 @@ cpdef enum symbol_t:
     Person_psor_one # fi, U
     Person_psor_two # fi, U
     Person_psor_three # fi, U
+    Person_zero # U20
+    Person_four # U20
     Polite_inf # bq, U
     Polite_pol # bq, U
     Polite_abs_inf # bq, U
@@ -319,6 +346,10 @@ cpdef enum symbol_t:
     Polite_erg_pol # bq, U
     Polite_dat_inf # bq, U
     Polite_dat_pol # bq, U
+    Polite_infm # U20
+    Polite_form # U20
+    Polite_form_elev # U20
+    Polite_form_humb # U20
     Prefix_yes # U
     PrepCase_npr # cz
     PrepCase_pre # U
@@ -383,6 +414,7 @@ cpdef enum symbol_t:
     ccomp
     complm
     conj
+    cop # U20
     csubj
     csubjpass
     dep
@@ -405,6 +437,8 @@ cpdef enum symbol_t:
     num
     number
     oprd
+    obj # U20
+    obl # U20
     parataxis
     partmod
     pcomp

View File

@@ -91,6 +91,7 @@ IDS = {
     "ADV": ADV,
     "AUX": AUX,
     "CONJ": CONJ,
+    "CCONJ": CCONJ, # U20
     "DET": DET,
     "INTJ": INTJ,
     "NOUN": NOUN,
@@ -108,11 +109,14 @@ IDS = {
     "Animacy_anim": Animacy_anim,
     "Animacy_inam": Animacy_inam,
+    "Animacy_hum": Animacy_hum, # U20
     "Aspect_freq": Aspect_freq,
     "Aspect_imp": Aspect_imp,
     "Aspect_mod": Aspect_mod,
     "Aspect_none": Aspect_none,
     "Aspect_perf": Aspect_perf,
+    "Aspect_iter": Aspect_iter, # U20
+    "Aspect_hab": Aspect_hab, # U20
     "Case_abe": Case_abe,
     "Case_abl": Case_abl,
     "Case_abs": Case_abs,
@@ -121,10 +125,12 @@ IDS = {
     "Case_all": Case_all,
     "Case_cau": Case_cau,
     "Case_com": Case_com,
+    "Case_cmp": Case_cmp, # U20
     "Case_dat": Case_dat,
     "Case_del": Case_del,
     "Case_dis": Case_dis,
     "Case_ela": Case_ela,
+    "Case_equ": Case_equ, # U20
     "Case_ess": Case_ess,
     "Case_gen": Case_gen,
     "Case_ill": Case_ill,
@@ -143,7 +149,9 @@ IDS = {
     "Definite_two": Definite_two,
     "Definite_def": Definite_def,
     "Definite_red": Definite_red,
+    "Definite_cons": Definite_cons, # U20
     "Definite_ind": Definite_ind,
+    "Definite_spec": Definite_spec, # U20
     "Degree_cmp": Degree_cmp,
     "Degree_comp": Degree_comp,
     "Degree_none": Degree_none,
@@ -152,6 +160,8 @@ IDS = {
     "Degree_abs": Degree_abs,
     "Degree_com": Degree_com,
     "Degree_dim ": Degree_dim, # du
+    "Degree_equ": Degree_equ, # U20
+    "Evident_nfh": Evident_nfh, # U20
     "Gender_com": Gender_com,
     "Gender_fem": Gender_fem,
     "Gender_masc": Gender_masc,
@@ -163,16 +173,21 @@ IDS = {
     "Mood_pot": Mood_pot,
     "Mood_sub": Mood_sub,
     "Mood_opt": Mood_opt,
+    "Mood_prp": Mood_prp, # U20
+    "Mood_adm": Mood_adm, # U20
     "Negative_neg": Negative_neg,
     "Negative_pos": Negative_pos,
     "Negative_yes": Negative_yes,
+    "Polarity_neg": Polarity_neg, # U20
+    "Polarity_pos": Polarity_pos, # U20
     "Number_com": Number_com,
     "Number_dual": Number_dual,
     "Number_none": Number_none,
     "Number_plur": Number_plur,
     "Number_sing": Number_sing,
     "Number_ptan ": Number_ptan, # bg
-    "Number_count ": Number_count, # bg
+    "Number_count ": Number_count, # bg, U20
+    "Number_tri": Number_tri, # U20
     "NumType_card": NumType_card,
     "NumType_dist": NumType_dist,
     "NumType_frac": NumType_frac,
@@ -198,7 +213,8 @@ IDS = {
     "PronType_rel": PronType_rel,
     "PronType_tot": PronType_tot,
     "PronType_clit": PronType_clit,
-    "PronType_exc ": PronType_exc, # es, ca, it, fa,
+    "PronType_exc": PronType_exc, # es, ca, it, fa, U20
+    "PronType_emp": PronType_emp, # U20
     "Reflex_yes": Reflex_yes,
     "Tense_fut": Tense_fut,
     "Tense_imp": Tense_imp,
@@ -214,12 +230,17 @@ IDS = {
     "VerbForm_partPres": VerbForm_partPres,
     "VerbForm_sup": VerbForm_sup,
     "VerbForm_trans": VerbForm_trans,
+    "VerbForm_conv": VerbForm_conv, # U20
     "VerbForm_gdv ": VerbForm_gdv, # la,
+    "VerbForm_vnoun": VerbForm_vnoun, # U20
     "Voice_act": Voice_act,
     "Voice_cau": Voice_cau,
     "Voice_pass": Voice_pass,
-    "Voice_mid ": Voice_mid, # gkc,
+    "Voice_mid ": Voice_mid, # gkc, U20
     "Voice_int ": Voice_int, # hb,
+    "Voice_antip": Voice_antip, # U20
+    "Voice_dir": Voice_dir, # U20
+    "Voice_inv": Voice_inv, # U20
     "Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
     "AdpType_prep ": AdpType_prep, # cz, U,
     "AdpType_post ": AdpType_post, # U,
@@ -285,6 +306,10 @@ IDS = {
     "Number_psee_plur ": Number_psee_plur, # U,
     "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
     "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
+    "Number_pauc": Number_pauc, # U20
+    "Number_grpa": Number_grpa, # U20
+    "Number_grpl": Number_grpl, # U20
+    "Number_inv": Number_inv, # U20
     "NumForm_digit ": NumForm_digit, # cz, sl, U,
     "NumForm_roman ": NumForm_roman, # cz, sl, U,
     "NumForm_word ": NumForm_word, # cz, sl, U,
@@ -312,6 +337,8 @@ IDS = {
     "Person_psor_one ": Person_psor_one, # fi, U,
     "Person_psor_two ": Person_psor_two, # fi, U,
     "Person_psor_three ": Person_psor_three, # fi, U,
+    "Person_zero ": Person_zero, # U20
+    "Person_four ": Person_four, # U20
     "Polite_inf ": Polite_inf, # bq, U,
     "Polite_pol ": Polite_pol, # bq, U,
     "Polite_abs_inf ": Polite_abs_inf, # bq, U,
@@ -320,6 +347,10 @@ IDS = {
     "Polite_erg_pol ": Polite_erg_pol, # bq, U,
     "Polite_dat_inf ": Polite_dat_inf, # bq, U,
     "Polite_dat_pol ": Polite_dat_pol, # bq, U,
+    "Polite_infm ": Polite_infm, # U20
+    "Polite_form ": Polite_form, # U20
+    "Polite_form_elev ": Polite_form_elev, # U20
+    "Polite_form_humb ": Polite_form_humb, # U20
     "Prefix_yes ": Prefix_yes, # U,
     "PrepCase_npr ": PrepCase_npr, # cz,
     "PrepCase_pre ": PrepCase_pre, # U,
@@ -384,6 +415,7 @@ IDS = {
     "ccomp": ccomp,
     "complm": complm,
     "conj": conj,
+    "cop": cop, # U20
     "csubj": csubj,
     "csubjpass": csubjpass,
     "dep": dep,
@@ -406,6 +438,8 @@ IDS = {
     "num": num,
     "number": number,
     "oprd": oprd,
+    "obj": obj, # U20
+    "obl": obl, # U20
     "parataxis": parataxis,
     "partmod": partmod,
     "pcomp": pcomp,

View File

@@ -124,6 +124,8 @@ cdef class Parser:
         elif 'features' not in cfg:
             cfg['features'] = self.feature_templates
         self.model = ParserModel(cfg['features'])
+        self.model.l1_penalty = cfg.get('L1', 0.0)
+
         self.cfg = cfg

     def __reduce__(self):
@@ -258,15 +260,20 @@ cdef class Parser:
             self.model.set_featuresC(&eg.c, stcls.c)
             self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
             self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
-            self.model.updateC(&eg.c)
+            self.model.time += 1
             guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
+            if eg.c.costs[guess] > 0:
+                best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
+                for feat in eg.c.features[:eg.c.nr_feat]:
+                    self.model.update_weight_ftrl(feat.key, best, -feat.value * eg.c.costs[guess])
+                    self.model.update_weight_ftrl(feat.key, guess, feat.value * eg.c.costs[guess])

-            action = self.moves.c[eg.guess]
+            action = self.moves.c[guess]
             action.do(stcls.c, action.label)
-            loss += eg.costs[eg.guess]
-            eg.fill_scores(0, eg.nr_class)
-            eg.fill_costs(0, eg.nr_class)
-            eg.fill_is_valid(1, eg.nr_class)
+            loss += eg.costs[guess]
+            eg.fill_scores(0, eg.c.nr_class)
+            eg.fill_costs(0, eg.c.nr_class)
+            eg.fill_is_valid(1, eg.c.nr_class)
         return loss

     def step_through(self, Doc doc):
@@ -385,6 +392,14 @@ class ParserStateError(ValueError):
             "Please include the text that the parser failed on, which is:\n"
             "%s" % repr(doc.text))

+
+cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if costs[i] <= 0:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
+
 cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
                        int nr_class) except -1:
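
The rewritten update loop above replaces the old model.updateC call with an explicit error-driven step: when the greedy guess has a cost, every active feature is pushed towards the best zero-cost action and away from the guess, scaled by that cost, via the model's update_weight_ftrl. Schematically, in plain Python (illustrative only; the real code operates on C arrays through thinc, which takes gradients rather than direct weight deltas, hence the flipped signs there):

    def update_step(scores, costs, is_valid, features, weights):
        # Greedy guess: highest-scoring action that is currently valid.
        guess = max((i for i, ok in enumerate(is_valid) if ok), key=lambda i: scores[i])
        if costs[guess] > 0:
            # Best action the gold parse still allows (cost <= 0).
            best = max((i for i, c in enumerate(costs) if c <= 0), key=lambda i: scores[i])
            for key, value in features:
                # Reward the gold-consistent action, penalise the guess.
                weights[key, best] = weights.get((key, best), 0.0) + value * costs[guess]
                weights[key, guess] = weights.get((key, guess), 0.0) - value * costs[guess]
        return guess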

View File

@@ -13,7 +13,7 @@ from thinc.linalg cimport VecVec
 from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
-from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
+from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
 from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE

 from .gold cimport GoldParse
@@ -152,6 +152,7 @@ cdef class Tagger:
             model = TaggerModel(cfg.get('features', self.feature_templates))
         self.vocab = vocab
         self.model = model
+        self.model.l1_penalty = 0.0
         # TODO: Move this to tag map
         self.freqs = {TAG: defaultdict(int)}
         for tag in self.tag_names:

View File

@@ -4,9 +4,15 @@ from __future__ import unicode_literals
 import pytest


+@pytest.mark.xfail
 @pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
 def test_issue792(en_tokenizer, text):
-    """Test for Issue #792: Trailing whitespace is removed after parsing."""
+    """Test for Issue #792: Trailing whitespace is removed after tokenization."""
     doc = en_tokenizer(text)
-    assert doc.text_with_ws == text
+    assert ''.join([token.text_with_ws for token in doc]) == text
+
+
+@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
+def test_control_issue792(en_tokenizer, text):
+    """Test base case for Issue #792: Non-trailing whitespace"""
+    doc = en_tokenizer(text)
+    assert ''.join([token.text_with_ws for token in doc]) == text

View File

@@ -0,0 +1,52 @@
+'''
+Test Matcher matches with '*' operator and Boolean flag
+'''
+from __future__ import unicode_literals
+from __future__ import print_function
+import pytest
+
+from ...matcher import Matcher
+from ...vocab import Vocab
+from ...attrs import LOWER
+from ...tokens import Doc
+
+
+def test_basic_case():
+    matcher = Matcher(Vocab(
+        lex_attr_getters={LOWER: lambda string: string.lower()}))
+    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
+    matcher.add_pattern(
+        "FarAway",
+        [
+            {LOWER: "bob"},
+            {'OP': '*', LOWER: 'and'},
+            {LOWER: 'frank'}
+        ])
+    doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
+    match = matcher(doc)
+    assert len(match) == 1
+    ent_id, label, start, end = match[0]
+    assert start == 0
+    assert end == 4
+
+
+@pytest.mark.xfail
+def test_issue850():
+    '''The problem here is that the variable-length pattern matches the
+    succeeding token. We then don't handle the ambiguity correctly.'''
+    matcher = Matcher(Vocab(
+        lex_attr_getters={LOWER: lambda string: string.lower()}))
+    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
+    matcher.add_pattern(
+        "FarAway",
+        [
+            {LOWER: "bob"},
+            {'OP': '*', IS_ANY_TOKEN: True},
+            {LOWER: 'frank'}
+        ])
+    doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
+    match = matcher(doc)
+    assert len(match) == 1
+    ent_id, label, start, end = match[0]
+    assert start == 0
+    assert end == 4

View File

@@ -0,0 +1,12 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
+                                  "aaabbb@ccc.com \nThank you!"])
+def test_issue859(en_tokenizer, text):
+    """Test that no extra space is added in doc.text method."""
+    doc = en_tokenizer(text)
+    assert doc.text == text

View File

@@ -0,0 +1,40 @@
+from __future__ import unicode_literals
+import io
+
+import pytest
+import dill as pickle
+
+from ..strings import StringStore
+from ..vocab import Vocab
+from ..attrs import NORM
+
+
+def test_pickle_string_store():
+    sstore = StringStore()
+    hello = sstore['hello']
+    bye = sstore['bye']
+    bdata = pickle.dumps(sstore, protocol=-1)
+    unpickled = pickle.loads(bdata)
+    assert unpickled['hello'] == hello
+    assert unpickled['bye'] == bye
+    assert len(sstore) == len(unpickled)
+
+
+@pytest.mark.xfail
+def test_pickle_vocab():
+    vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
+    dog = vocab[u'dog']
+    cat = vocab[u'cat']
+    assert dog.norm_ == 'do'
+    assert cat.norm_ == 'ca'
+
+    bdata = pickle.dumps(vocab)
+    unpickled = pickle.loads(bdata)
+    assert unpickled[u'dog'].orth == dog.orth
+    assert unpickled[u'cat'].orth == cat.orth
+    assert unpickled[u'dog'].norm == dog.norm
+    assert unpickled[u'cat'].norm == cat.norm
+
+    dog_ = unpickled[u'dog']
+    cat_ = unpickled[u'cat']
+    assert dog_.norm != cat_.norm

View File

@@ -163,7 +163,6 @@ cdef class Tokenizer:
                 start = i
                 in_ws = not in_ws
             i += 1
-            i += 1
         if start < i:
             span = string[start:]
             key = hash_string(span)
@@ -275,7 +274,10 @@
         if cache_hit:
             pass
         elif self.token_match and self.token_match(string):
-            tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size())
+            # We're always saying 'no' to spaces here -- the caller will
+            # fix up the outermost one, with reference to the original.
+            # See Issue #859
+            tokens.push_back(self.vocab.get(tokens.mem, string), False)
         else:
             matches = self.find_infix(string)
             if not matches:

View File

@@ -16,7 +16,7 @@ from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
-from ..parts_of_speech cimport CONJ, PUNCT, NOUN
+from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
 from .span cimport Span

View File

@@ -20,7 +20,7 @@ from .. import parts_of_speech
 from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP
-from ..parts_of_speech cimport CONJ, PUNCT
+from ..parts_of_speech cimport CCONJ, PUNCT
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport IS_BRACKET

View File

@@ -9,11 +9,16 @@ import bz2
 import ujson as json
 import re

+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
 from .typedefs cimport attr_t
-from .cfile cimport CFile
+from .cfile cimport CFile, StringCFile
 from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs
 from .tokens.token cimport Token
@@ -346,17 +351,18 @@ cdef class Vocab:
             Token.set_struct_attr(token, attr_id, value)
         return tokens

-    def dump(self, loc):
-        """Save the lexemes binary data to the given location.
+    def dump(self, loc=None):
+        """Save the lexemes binary data to the given location, or
+        return a byte-string with the data if loc is None.

         Arguments:
-            loc (Path): The path to save to.
+            loc (Path or None): The path to save to, or None.
         """
-        if hasattr(loc, 'as_posix'):
-            loc = loc.as_posix()
-        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
-        cdef CFile fp = CFile(bytes_loc, 'wb')
+        cdef CFile fp
+        if loc is None:
+            fp = StringCFile('wb')
+        else:
+            fp = CFile(loc, 'wb')
         cdef size_t st
         cdef size_t addr
         cdef hash_t key
@@ -378,6 +384,8 @@ cdef class Vocab:
             fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
             fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
         fp.close()
+        if loc is None:
+            return fp.string_data()

     def load_lexemes(self, loc):
         '''Load the binary vocabulary data from the given location.
@@ -427,6 +435,60 @@ cdef class Vocab:
             i += 1
         fp.close()

+    def _deserialize_lexemes(self, CFile fp):
+        '''Load the binary vocabulary data from the given CFile.
+        '''
+        cdef LexemeC* lexeme
+        cdef hash_t key
+        cdef unicode py_str
+        cdef attr_t orth
+        assert sizeof(orth) == sizeof(lexeme.orth)
+        i = 0
+        cdef int todo = fp.size
+        cdef int lex_size = sizeof(lexeme.flags)
+        lex_size += sizeof(lexeme.id)
+        lex_size += sizeof(lexeme.length)
+        lex_size += sizeof(lexeme.orth)
+        lex_size += sizeof(lexeme.lower)
+        lex_size += sizeof(lexeme.norm)
+        lex_size += sizeof(lexeme.shape)
+        lex_size += sizeof(lexeme.prefix)
+        lex_size += sizeof(lexeme.suffix)
+        lex_size += sizeof(lexeme.cluster)
+        lex_size += sizeof(lexeme.prob)
+        lex_size += sizeof(lexeme.sentiment)
+        lex_size += sizeof(lexeme.l2_norm)
+        lex_size += sizeof(lexeme.lang)
+        while True:
+            if todo < lex_size:
+                break
+            todo -= lex_size
+            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
+            # Copy data from the file into the lexeme
+            fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
+            fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))
+            fp.read_into(&lexeme.length, 1, sizeof(lexeme.length))
+            fp.read_into(&lexeme.orth, 1, sizeof(lexeme.orth))
+            fp.read_into(&lexeme.lower, 1, sizeof(lexeme.lower))
+            fp.read_into(&lexeme.norm, 1, sizeof(lexeme.norm))
+            fp.read_into(&lexeme.shape, 1, sizeof(lexeme.shape))
+            fp.read_into(&lexeme.prefix, 1, sizeof(lexeme.prefix))
+            fp.read_into(&lexeme.suffix, 1, sizeof(lexeme.suffix))
+            fp.read_into(&lexeme.cluster, 1, sizeof(lexeme.cluster))
+            fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
+            fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
+            fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
+            fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))
+            lexeme.vector = EMPTY_VEC
+            py_str = self.strings[lexeme.orth]
+            key = hash_string(py_str)
+            self._by_hash.set(key, lexeme)
+            self._by_orth.set(lexeme.orth, lexeme)
+            self.length += 1
+            i += 1
+        fp.close()
+
     def dump_vectors(self, out_loc):
         '''Save the word vectors to a binary file.
@@ -553,6 +615,42 @@ cdef class Vocab:
         return vec_len


+def pickle_vocab(vocab):
+    sstore = vocab.strings
+    morph = vocab.morphology
+    length = vocab.length
+    serializer = vocab._serializer
+    data_dir = vocab.data_dir
+    lex_attr_getters = vocab.lex_attr_getters
+
+    lexemes_data = vocab.dump()
+    vectors_length = vocab.vectors_length
+
+    return (unpickle_vocab,
+        (sstore, morph, serializer, data_dir, lex_attr_getters,
+            lexemes_data, length, vectors_length))
+
+
+def unpickle_vocab(sstore, morphology, serializer, data_dir,
+        lex_attr_getters, bytes lexemes_data, int length, int vectors_length):
+    cdef Vocab vocab = Vocab()
+    vocab.length = length
+    vocab.vectors_length = vectors_length
+    vocab.strings = sstore
+    cdef CFile fp = StringCFile('r', data=lexemes_data)
+    vocab.morphology = morphology
+    vocab._serializer = serializer
+    vocab.data_dir = data_dir
+    vocab.lex_attr_getters = lex_attr_getters
+    vocab._deserialize_lexemes(fp)
+    vocab.length = length
+    vocab.vectors_length = vectors_length
+    return vocab
+
+
+copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab)
+
+
 def write_binary_vectors(in_loc, out_loc):
     cdef CFile out_file = CFile(out_loc, 'wb')
     cdef Address mem
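
Taken together, dump(loc=None), StringCFile and the copy_reg registration above are what let a Vocab travel through pickle: pickle_vocab snapshots the lexemes into a byte-string, and unpickle_vocab rebuilds a Vocab from it. The intended usage matches the test_pickle_vocab test added in this commit; note that test is still marked xfail, so treat this as the target behaviour rather than a guarantee:

    import dill as pickle
    from spacy.vocab import Vocab
    from spacy.attrs import NORM

    vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
    dog = vocab[u'dog']
    data = pickle.dumps(vocab)       # goes through pickle_vocab -> vocab.dump()
    restored = pickle.loads(data)    # goes through unpickle_vocab
    assert restored[u'dog'].orth == dog.orth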