Merge branch 'develop'

Matthew Honnibal 2017-03-10 02:49:39 -06:00
commit ea53647362
29 changed files with 535 additions and 138 deletions

View File

@ -66,8 +66,8 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
format_str = '{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
with Language.train(model_dir, train_data,
tagger_cfg, parser_cfg, entity_cfg) as trainer:
loss = 0
@ -76,11 +76,13 @@ def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, ent
for doc, gold in epoch:
trainer.update(doc, gold)
dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
print(format_str.format(itn, loss, **dev_scores.scores))
print(format_str.format(itn, trainer.nlp.parser.model.nr_weight,
trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores))
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
beam_width=None, cand_preproc=None):
print("Load parser", model_dir)
nlp = Language(path=model_dir)
if nlp.lang == 'de':
nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
@ -145,21 +147,25 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
verbose=("Verbose error reporting", "flag", "v", bool),
debug=("Debug mode", "flag", "d", bool),
pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
L1=("L1 regularization penalty", "option", "L", float),
)
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
L1=1e-6):
parser_cfg = dict(locals())
tagger_cfg = dict(locals())
entity_cfg = dict(locals())
lang = spacy.util.get_lang_class(language)
parser_cfg['features'] = lang.Defaults.parser_features
entity_cfg['features'] = lang.Defaults.entity_features
if not eval_only:
gold_train = list(read_json_file(train_loc))
gold_dev = list(read_json_file(dev_loc))
if n_sents > 0:
gold_train = gold_train[:n_sents]
train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
n_iter=n_iter)
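
Aside: the new L1 option needs no extra plumbing here, because the script builds its configs from dict(locals()), so the plac-annotated argument lands in parser_cfg automatically. A minimal sketch of that mechanism (function body and values hypothetical, not the script itself):

def main(language='en', n_iter=15, L1=1e-6):
    parser_cfg = dict(locals())  # snapshots every argument, L1 included
    return parser_cfg

assert main()['L1'] == 1e-6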

View File

@ -10,3 +10,4 @@ six
ujson>=1.35
cloudpickle
sputnik>=0.9.2,<0.10.0
dill>=0.2,<0.3

View File

@ -241,7 +241,8 @@ def setup_package():
'cloudpickle',
'pathlib',
'sputnik>=0.9.2,<0.10.0',
'ujson>=1.35'],
'ujson>=1.35',
'dill>=0.2,<0.3'],
classifiers=[
'Development Status :: 5 - Production/Stable',
'Environment :: Console',

View File

@ -93,7 +93,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
'''Normalize a dictionary of attributes, converting them to ints.
Arguments:
stringy_attrs (dict):
Dictionary keyed by attribute string names. Values can be ints or strings.
@ -125,7 +125,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case']
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
'Polarity', # U20
]
for key in morph_keys:
if key in stringy_attrs:
stringy_attrs.pop(key)
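
A hedged example of the normalization described in the docstring above: string keys map to their integer attribute IDs, and int values pass through unchanged (so no strings_map is needed here):

from spacy.attrs import intify_attrs, LOWER

assert intify_attrs({'LOWER': 5}) == {LOWER: 5}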

View File

@ -4,6 +4,20 @@ from cymem.cymem cimport Pool
cdef class CFile:
cdef FILE* fp
cdef bint is_open
cdef Pool mem
cdef int size # For compatibility with subclass
cdef int _capacity # For compatibility with subclass
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
cdef class StringCFile(CFile):
cdef unsigned char* data
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

View File

@ -1,4 +1,5 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memcpy
cdef class CFile:
@ -9,6 +10,7 @@ cdef class CFile:
mode_str = mode
if hasattr(loc, 'as_posix'):
loc = loc.as_posix()
self.mem = Pool()
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self.fp = fopen(<char*>bytes_loc, mode_str)
if self.fp == NULL:
@ -45,3 +47,42 @@ cdef class CFile:
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)
cdef class StringCFile:
def __init__(self, mode, bytes data=b'', on_open_error=None):
self.mem = Pool()
self.is_open = 'w' in mode
self._capacity = max(len(data), 8)
self.size = len(data)
self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
for i in range(len(data)):
self.data[i] = data[i]
def close(self):
self.is_open = False
def string_data(self):
return (self.data-self.size)[:self.size]
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
memcpy(dest, self.data, elem_size * number)
self.data += elem_size * number
cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
write_size = number * elem_size
if (self.size + write_size) >= self._capacity:
self._capacity = (self.size + write_size) * 2
self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
memcpy(&self.data[self.size], src, elem_size * number)
self.size += write_size
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
cdef void* dest = mem.alloc(number, elem_size)
self.read_into(dest, number, elem_size)
return dest
def write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)
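
A pure-Python analogue of StringCFile's write path, for readers skimming the Cython (a sketch, not the class itself): the backing buffer doubles whenever a write would overflow, mirroring the _capacity logic above.

class GrowBuffer(object):
    def __init__(self, data=b''):
        self.size = len(data)
        self._capacity = max(len(data), 8)
        self._data = bytearray(self._capacity)
        self._data[:len(data)] = data

    def write_from(self, payload):
        # Double capacity when full, as StringCFile.write_from does
        if self.size + len(payload) >= self._capacity:
            self._capacity = (self.size + len(payload)) * 2
            self._data.extend(bytearray(self._capacity - len(self._data)))
        self._data[self.size:self.size + len(payload)] = payload
        self.size += len(payload)

    def string_data(self):
        return bytes(self._data[:self.size])

buf = GrowBuffer()
buf.write_from(b'hello ')
buf.write_from(b'world')
assert buf.string_data() == b'hello world'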

View File

@ -41,7 +41,7 @@ TAG_MAP = {
"PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
"PTKA": {POS: PART},
"PTKANT": {POS: PART, "PartType": "res"},
"PTKNEG": {POS: PART, "Negative": "yes"},
"PTKNEG": {POS: PART, "Polarity": "Neg"},
"PTKVZ": {POS: PART, "PartType": "vbp"},
"PTKZU": {POS: PART, "PartType": "inf"},
"PWAT": {POS: DET, "PronType": "int"},

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals, print_function
from os import path
from pathlib import Path
from ..util import match_best_version
from ..util import get_data_path
@ -13,6 +14,11 @@ from ..attrs import LANG
from .language_data import *
try:
basestring
except NameError:
basestring = str
class English(Language):
lang = 'en'
@ -43,14 +49,15 @@ def _fix_deprecated_glove_vectors_loading(overrides):
data_path = get_data_path()
else:
path = overrides['path']
if isinstance(path, basestring):
path = Path(path)
data_path = path.parent
vec_path = None
if 'add_vectors' not in overrides:
if 'vectors' in overrides:
vec_path = match_best_version(overrides['vectors'], None, data_path)
if vec_path is None:
raise IOError(
'Could not load data pack %s from %s' % (overrides['vectors'], data_path))
return overrides
else:
vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
if vec_path is not None:
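
The basestring shim added above is the usual py2/py3 compatibility idiom: on Python 3 the name is undefined, so it falls back to str, and the isinstance check then catches plain-string paths on both versions. A self-contained illustration (path value hypothetical):

from pathlib import Path

try:
    basestring
except NameError:
    basestring = str  # Python 3

path = 'models/en'
if isinstance(path, basestring):
    path = Path(path)
assert isinstance(path, Path)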

View File

@ -16,7 +16,7 @@ TAG_MAP = {
"$": {POS: SYM, "Other": {"SymType": "currency"}},
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
"AFX": {POS: ADJ, "Hyph": "yes"},
"CC": {POS: CONJ, "ConjType": "coor"},
"CC": {POS: CCONJ, "ConjType": "coor"},
"CD": {POS: NUM, "NumType": "card"},
"DT": {POS: DET},
"EX": {POS: ADV, "AdvType": "ex"},

View File

@ -5,7 +5,7 @@ import pathlib
from contextlib import contextmanager
import shutil
import ujson as json
import ujson
try:
@ -13,6 +13,10 @@ try:
except NameError:
basestring = str
try:
unicode
except NameError:
unicode = str
from .tokenizer import Tokenizer
from .vocab import Vocab
@ -226,12 +230,21 @@ class Language(object):
parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
with (dep_model_dir / 'config.json').open('w') as file_:
json.dump(parser_cfg, file_)
with (ner_model_dir / 'config.json').open('w') as file_:
json.dump(entity_cfg, file_)
with (pos_model_dir / 'config.json').open('w') as file_:
json.dump(tagger_cfg, file_)
with (dep_model_dir / 'config.json').open('wb') as file_:
data = ujson.dumps(parser_cfg)
if isinstance(data, unicode):
data = data.encode('utf8')
file_.write(data)
with (ner_model_dir / 'config.json').open('wb') as file_:
data = ujson.dumps(entity_cfg)
if isinstance(data, unicode):
data = data.encode('utf8')
file_.write(data)
with (pos_model_dir / 'config.json').open('wb') as file_:
data = ujson.dumps(tagger_cfg)
if isinstance(data, unicode):
data = data.encode('utf8')
file_.write(data)
self = cls(
path=path,
@ -252,7 +265,7 @@ class Language(object):
self.entity = self.Defaults.create_entity(self)
self.pipeline = self.Defaults.create_pipeline(self)
yield Trainer(self, gold_tuples)
self.end_training()
self.end_training(path=path)
def __init__(self, **overrides):
if 'data_dir' in overrides and 'path' not in overrides:
@ -391,12 +404,14 @@ class Language(object):
else:
entity_iob_freqs = []
entity_type_freqs = []
with (path / 'vocab' / 'serializer.json').open('w') as file_:
file_.write(
json.dumps([
(TAG, tagger_freqs),
(DEP, dep_freqs),
(ENT_IOB, entity_iob_freqs),
(ENT_TYPE, entity_type_freqs),
(HEAD, head_freqs)
]))
with (path / 'vocab' / 'serializer.json').open('wb') as file_:
data = ujson.dumps([
(TAG, tagger_freqs),
(DEP, dep_freqs),
(ENT_IOB, entity_iob_freqs),
(ENT_TYPE, entity_type_freqs),
(HEAD, head_freqs)
])
if isinstance(data, unicode):
data = data.encode('utf8')
file_.write(data)
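
The three config writes above repeat one pattern; a possible helper capturing it (a sketch, not code from the commit, and write_json is a hypothetical name): ujson.dumps may return str rather than bytes, hence the encode before writing to a binary-mode file.

import ujson

def write_json(path, obj):
    data = ujson.dumps(obj)
    if isinstance(data, str):  # 'unicode' under the shim above
        data = data.encode('utf8')
    with path.open('wb') as file_:
        file_.write(data)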

View File

@ -19,6 +19,7 @@ TAG_MAP = {
"AUX": {POS: AUX},
"X": {POS: X},
"CONJ": {POS: CONJ},
"CCONJ": {POS: CCONJ}, # U20
"ADJ": {POS: ADJ},
"VERB": {POS: VERB},
"PART": {POS: PART}

View File

@ -37,7 +37,7 @@ cdef class Morphology:
cdef int assign_tag(self, TokenC* token, tag) except -1
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
@ -80,6 +80,7 @@ cpdef enum univ_morph_t:
Definite_two
Definite_def
Definite_red
Definite_cons # U20
Definite_ind
Degree_cmp
Degree_comp
@ -103,6 +104,8 @@ cpdef enum univ_morph_t:
Negative_neg
Negative_pos
Negative_yes
Polarity_neg # U20
Polarity_pos # U20
Number_com
Number_dual
Number_none
@ -151,6 +154,7 @@ cpdef enum univ_morph_t:
VerbForm_partPres
VerbForm_sup
VerbForm_trans
VerbForm_conv # U20
VerbForm_gdv # la
Voice_act
Voice_cau

View File

@ -192,6 +192,7 @@ IDS = {
"Definite_two": Definite_two,
"Definite_def": Definite_def,
"Definite_red": Definite_red,
"Definite_cons": Definite_cons, # U20
"Definite_ind": Definite_ind,
"Degree_cmp": Degree_cmp,
"Degree_comp": Degree_comp,
@ -215,6 +216,8 @@ IDS = {
"Negative_neg": Negative_neg,
"Negative_pos": Negative_pos,
"Negative_yes": Negative_yes,
"Polarity_neg": Polarity_neg, # U20
"Polarity_pos": Polarity_pos, # U20
"Number_com": Number_com,
"Number_dual": Number_dual,
"Number_none": Number_none,
@ -263,6 +266,7 @@ IDS = {
"VerbForm_partPres": VerbForm_partPres,
"VerbForm_sup": VerbForm_sup,
"VerbForm_trans": VerbForm_trans,
"VerbForm_conv": VerbForm_conv, # U20
"VerbForm_gdv ": VerbForm_gdv, # la,
"Voice_act": Voice_act,
"Voice_cau": Voice_cau,

View File

@ -7,6 +7,7 @@ cpdef enum univ_pos_t:
ADV
AUX
CONJ
CCONJ # U20
DET
INTJ
NOUN

View File

@ -7,7 +7,8 @@ IDS = {
"ADP": ADP,
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ,
"CONJ": CONJ, # U20
"CCONJ": CCONJ,
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals, absolute_import
cimport cython
from libc.string cimport memcpy
from libc.stdint cimport uint64_t
from libc.stdint cimport uint64_t, uint32_t
from murmurhash.mrmr cimport hash64, hash32
@ -12,22 +12,19 @@ from preshed.maps cimport map_iter, key_t
from .typedefs cimport hash_t
from libc.stdint cimport uint32_t
try:
import ujson as json
except ImportError:
import json
import ujson
cpdef hash_t hash_string(unicode string) except 0:
chars = string.encode('utf8')
return _hash_utf8(chars, len(chars))
return hash_utf8(chars, len(chars))
cdef hash_t _hash_utf8(char* utf8_string, int length):
cdef hash_t hash_utf8(char* utf8_string, int length) nogil:
return hash64(utf8_string, length, 1)
cdef uint32_t _hash32_utf8(char* utf8_string, int length):
cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1)
@ -48,11 +45,11 @@ cdef unicode _decode(const Utf8Str* string):
return string.p[i:length + i].decode('utf8')
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *:
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
cdef int n_length_bytes
cdef int i
cdef Utf8Str string
assert length != 0
cdef uint32_t ulength = length
if length < sizeof(string.s):
string.s[0] = <unsigned char>length
memcpy(&string.s[1], chars, length)
@ -98,6 +95,14 @@ cdef class StringStore:
def __get__(self):
return self.size -1
def __reduce__(self):
# TODO: OOV words, for the is_frozen stuff?
if self.is_frozen:
raise NotImplementedError(
"Currently missing support for pickling StringStore when "
"is_frozen=True")
return (StringStore, (list(self),))
def __len__(self):
"""The number of strings in the store.
@ -149,7 +154,7 @@ cdef class StringStore:
# pretty bad.
# We could also get unlucky here, and hash into a value that
# collides with the 'real' strings.
return _hash32_utf8(byte_string, len(byte_string))
return hash32_utf8(byte_string, len(byte_string))
else:
return utf8str - self.c
@ -200,7 +205,7 @@ cdef class StringStore:
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length):
# TODO: This function's API/behaviour is an unholy mess...
# 0 means missing, but we don't bother offsetting the index.
cdef hash_t key = _hash_utf8(utf8_string, length)
cdef hash_t key = hash_utf8(utf8_string, length)
cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
if value is not NULL:
return value
@ -209,7 +214,7 @@ cdef class StringStore:
return value
if self.is_frozen:
# OOV store uses 32 bit hashes. Pretty ugly :(
key32 = _hash32_utf8(utf8_string, length)
key32 = hash32_utf8(utf8_string, length)
# Important: Make the OOV store own the memory. That way it's trivial
# to flush them all.
value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
@ -232,7 +237,7 @@ cdef class StringStore:
Returns:
None
"""
string_data = json.dumps(list(self))
string_data = ujson.dumps(list(self))
if not isinstance(string_data, unicode):
string_data = string_data.decode('utf8')
# TODO: OOV?
@ -246,7 +251,7 @@ cdef class StringStore:
Returns:
None
"""
strings = json.load(file_)
strings = ujson.load(file_)
if strings == ['']:
return None
cdef unicode string
@ -271,7 +276,7 @@ cdef class StringStore:
# Find array index with pointer arithmetic
offset = ((<Utf8Str*>value) - self.c)
keys[offset] = key
self._resize_at *= 2
cdef size_t new_size = self._resize_at * sizeof(Utf8Str)
self.c = <Utf8Str*>self.mem.realloc(self.c, new_size)
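
What the new __reduce__ buys in practice (see also the test in the pickling test file further down, which uses dill): a StringStore reduces to (StringStore, (list_of_strings,)), so a round trip should rebuild an equivalent store. A hedged sketch, assuming spaCy is installed:

import pickle
from spacy.strings import StringStore

sstore = StringStore()
hello_id = sstore['hello']
restored = pickle.loads(pickle.dumps(sstore))
assert restored['hello'] == hello_id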

View File

@ -13,7 +13,7 @@ cpdef enum symbol_t:
LIKE_EMAIL
IS_STOP
IS_OOV
FLAG14 = 14
FLAG15
FLAG16
@ -90,6 +90,7 @@ cpdef enum symbol_t:
ADV
AUX
CONJ
CCONJ # U20
DET
INTJ
NOUN
@ -107,11 +108,14 @@ cpdef enum symbol_t:
Animacy_anim
Animacy_inam
Animacy_hum # U20
Aspect_freq
Aspect_imp
Aspect_mod
Aspect_none
Aspect_perf
Aspect_iter # U20
Aspect_hab # U20
Case_abe
Case_abl
Case_abs
@ -120,10 +124,12 @@ cpdef enum symbol_t:
Case_all
Case_cau
Case_com
Case_cmp # U20
Case_dat
Case_del
Case_dis
Case_ela
Case_equ # U20
Case_ess
Case_gen
Case_ill
@ -142,7 +148,9 @@ cpdef enum symbol_t:
Definite_two
Definite_def
Definite_red
Definite_cons # U20
Definite_ind
Definite_spec # U20
Degree_cmp
Degree_comp
Degree_none
@ -151,6 +159,8 @@ cpdef enum symbol_t:
Degree_abs
Degree_com
Degree_dim # du
Degree_equ # U20
Evident_nfh # U20
Gender_com
Gender_fem
Gender_masc
@ -162,16 +172,21 @@ cpdef enum symbol_t:
Mood_pot
Mood_sub
Mood_opt
Mood_prp # U20
Mood_adm # U20
Negative_neg
Negative_pos
Negative_yes
Polarity_neg # U20
Polarity_pos # U20
Number_com
Number_dual
Number_none
Number_plur
Number_sing
Number_ptan # bg
Number_count # bg
Number_count # bg, U20
Number_tri # U20
NumType_card
NumType_dist
NumType_frac
@ -197,7 +212,8 @@ cpdef enum symbol_t:
PronType_rel
PronType_tot
PronType_clit
PronType_exc # es, ca, it, fa
PronType_exc # es, ca, it, fa, U20
PronType_emp # U20
Reflex_yes
Tense_fut
Tense_imp
@ -213,12 +229,17 @@ cpdef enum symbol_t:
VerbForm_partPres
VerbForm_sup
VerbForm_trans
VerbForm_conv # U20
VerbForm_gdv # la
VerbForm_vnoun # U20
Voice_act
Voice_cau
Voice_pass
Voice_mid # gkc
Voice_mid # gkc, U20
Voice_int # hb
Voice_antip # U20
Voice_dir # U20
Voice_inv # U20
Abbr_yes # cz, fi, sl, U
AdpType_prep # cz, U
AdpType_post # U
@ -284,6 +305,10 @@ cpdef enum symbol_t:
Number_psee_plur # U
Number_psor_sing # cz, fi, sl, U
Number_psor_plur # cz, fi, sl, U
Number_pauc # U20
Number_grpa # U20
Number_grpl # U20
Number_inv # U20
NumForm_digit # cz, sl, U
NumForm_roman # cz, sl, U
NumForm_word # cz, sl, U
@ -311,6 +336,8 @@ cpdef enum symbol_t:
Person_psor_one # fi, U
Person_psor_two # fi, U
Person_psor_three # fi, U
Person_zero # U20
Person_four # U20
Polite_inf # bq, U
Polite_pol # bq, U
Polite_abs_inf # bq, U
@ -319,6 +346,10 @@ cpdef enum symbol_t:
Polite_erg_pol # bq, U
Polite_dat_inf # bq, U
Polite_dat_pol # bq, U
Polite_infm # U20
Polite_form # U20
Polite_form_elev # U20
Polite_form_humb # U20
Prefix_yes # U
PrepCase_npr # cz
PrepCase_pre # U
@ -383,6 +414,7 @@ cpdef enum symbol_t:
ccomp
complm
conj
cop # U20
csubj
csubjpass
dep
@ -405,6 +437,8 @@ cpdef enum symbol_t:
num
number
oprd
obj # U20
obl # U20
parataxis
partmod
pcomp

View File

@ -91,6 +91,7 @@ IDS = {
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ,
"CCONJ": CCONJ, # U20
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,
@ -108,11 +109,14 @@ IDS = {
"Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam,
"Animacy_hum": Animacy_hum, # U20
"Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod,
"Aspect_none": Aspect_none,
"Aspect_perf": Aspect_perf,
"Aspect_iter": Aspect_iter, # U20
"Aspect_hab": Aspect_hab, # U20
"Case_abe": Case_abe,
"Case_abl": Case_abl,
"Case_abs": Case_abs,
@ -121,10 +125,12 @@ IDS = {
"Case_all": Case_all,
"Case_cau": Case_cau,
"Case_com": Case_com,
"Case_cmp": Case_cmp, # U20
"Case_dat": Case_dat,
"Case_del": Case_del,
"Case_dis": Case_dis,
"Case_ela": Case_ela,
"Case_equ": Case_equ, # U20
"Case_ess": Case_ess,
"Case_gen": Case_gen,
"Case_ill": Case_ill,
@ -143,7 +149,9 @@ IDS = {
"Definite_two": Definite_two,
"Definite_def": Definite_def,
"Definite_red": Definite_red,
"Definite_cons": Definite_cons, # U20
"Definite_ind": Definite_ind,
"Definite_spec": Definite_spec, # U20
"Degree_cmp": Degree_cmp,
"Degree_comp": Degree_comp,
"Degree_none": Degree_none,
@ -152,6 +160,8 @@ IDS = {
"Degree_abs": Degree_abs,
"Degree_com": Degree_com,
"Degree_dim ": Degree_dim, # du
"Degree_equ": Degree_equ, # U20
"Evident_nfh": Evident_nfh, # U20
"Gender_com": Gender_com,
"Gender_fem": Gender_fem,
"Gender_masc": Gender_masc,
@ -163,16 +173,21 @@ IDS = {
"Mood_pot": Mood_pot,
"Mood_sub": Mood_sub,
"Mood_opt": Mood_opt,
"Mood_prp": Mood_prp, # U20
"Mood_adm": Mood_adm, # U20
"Negative_neg": Negative_neg,
"Negative_pos": Negative_pos,
"Negative_yes": Negative_yes,
"Polarity_neg": Polarity_neg, # U20
"Polarity_pos": Polarity_pos, # U20
"Number_com": Number_com,
"Number_dual": Number_dual,
"Number_none": Number_none,
"Number_plur": Number_plur,
"Number_sing": Number_sing,
"Number_ptan ": Number_ptan, # bg
"Number_count ": Number_count, # bg
"Number_count ": Number_count, # bg, U20
"Number_tri": Number_tri, # U20
"NumType_card": NumType_card,
"NumType_dist": NumType_dist,
"NumType_frac": NumType_frac,
@ -198,7 +213,8 @@ IDS = {
"PronType_rel": PronType_rel,
"PronType_tot": PronType_tot,
"PronType_clit": PronType_clit,
"PronType_exc ": PronType_exc, # es, ca, it, fa,
"PronType_exc": PronType_exc, # es, ca, it, fa, U20
"PronType_emp": PronType_emp, # U20
"Reflex_yes": Reflex_yes,
"Tense_fut": Tense_fut,
"Tense_imp": Tense_imp,
@ -214,12 +230,17 @@ IDS = {
"VerbForm_partPres": VerbForm_partPres,
"VerbForm_sup": VerbForm_sup,
"VerbForm_trans": VerbForm_trans,
"VerbForm_conv": VerbForm_conv, # U20
"VerbForm_gdv ": VerbForm_gdv, # la,
"VerbForm_vnoun": VerbForm_vnoun, # U20
"Voice_act": Voice_act,
"Voice_cau": Voice_cau,
"Voice_pass": Voice_pass,
"Voice_mid ": Voice_mid, # gkc,
"Voice_mid ": Voice_mid, # gkc, U20
"Voice_int ": Voice_int, # hb,
"Voice_antip": Voice_antip, # U20
"Voice_dir": Voice_dir, # U20
"Voice_inv": Voice_inv, # U20
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
"AdpType_prep ": AdpType_prep, # cz, U,
"AdpType_post ": AdpType_post, # U,
@ -285,6 +306,10 @@ IDS = {
"Number_psee_plur ": Number_psee_plur, # U,
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
"Number_pauc": Number_pauc, # U20
"Number_grpa": Number_grpa, # U20
"Number_grpl": Number_grpl, # U20
"Number_inv": Number_inv, # U20
"NumForm_digit ": NumForm_digit, # cz, sl, U,
"NumForm_roman ": NumForm_roman, # cz, sl, U,
"NumForm_word ": NumForm_word, # cz, sl, U,
@ -312,6 +337,8 @@ IDS = {
"Person_psor_one ": Person_psor_one, # fi, U,
"Person_psor_two ": Person_psor_two, # fi, U,
"Person_psor_three ": Person_psor_three, # fi, U,
"Person_zero ": Person_zero, # U20
"Person_four ": Person_four, # U20
"Polite_inf ": Polite_inf, # bq, U,
"Polite_pol ": Polite_pol, # bq, U,
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
@ -320,6 +347,10 @@ IDS = {
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
"Polite_infm ": Polite_infm, # U20
"Polite_form ": Polite_form, # U20
"Polite_form_elev ": Polite_form_elev, # U20
"Polite_form_humb ": Polite_form_humb, # U20
"Prefix_yes ": Prefix_yes, # U,
"PrepCase_npr ": PrepCase_npr, # cz,
"PrepCase_pre ": PrepCase_pre, # U,
@ -384,6 +415,7 @@ IDS = {
"ccomp": ccomp,
"complm": complm,
"conj": conj,
"cop": cop, # U20
"csubj": csubj,
"csubjpass": csubjpass,
"dep": dep,
@ -406,6 +438,8 @@ IDS = {
"num": num,
"number": number,
"oprd": oprd,
"obj": obj, # U20
"obl": obl, # U20
"parataxis": parataxis,
"partmod": partmod,
"pcomp": pcomp,

View File

@ -8,7 +8,7 @@ from spacy.attrs import DEP, HEAD
def ancestors(tokenid, heads):
# returns all words going from the word up the path to the root
# the path to root cannot be longer than the number of words in the sentence
# this function ends after at most len(heads) steps
# this function ends after at most len(heads) steps
# because it would otherwise loop indefinitely on cycles
head = tokenid
cnt = 0
@ -180,7 +180,7 @@ class PseudoProjectivity:
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space: continue
if child.is_space: continue
if child == token: continue
if child.dep_ == headlabel:
return child

View File

@ -68,7 +68,7 @@ def get_templates(name):
cdef class ParserModel(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
fill_context(eg.atoms, state)
eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
@ -124,6 +124,8 @@ cdef class Parser:
elif 'features' not in cfg:
cfg['features'] = self.feature_templates
self.model = ParserModel(cfg['features'])
self.model.l1_penalty = cfg.get('L1', 0.0)
self.cfg = cfg
def __reduce__(self):
@ -232,7 +234,7 @@ cdef class Parser:
free(eg.scores)
free(eg.is_valid)
return 0
def update(self, Doc tokens, GoldParse gold):
"""Update the statistical model.
@ -258,15 +260,20 @@ cdef class Parser:
self.model.set_featuresC(&eg.c, stcls.c)
self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
self.model.updateC(&eg.c)
self.model.time += 1
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
if eg.c.costs[guess] > 0:
best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
for feat in eg.c.features[:eg.c.nr_feat]:
self.model.update_weight_ftrl(feat.key, best, -feat.value * eg.c.costs[guess])
self.model.update_weight_ftrl(feat.key, guess, feat.value * eg.c.costs[guess])
action = self.moves.c[eg.guess]
action = self.moves.c[guess]
action.do(stcls.c, action.label)
loss += eg.costs[eg.guess]
eg.fill_scores(0, eg.nr_class)
eg.fill_costs(0, eg.nr_class)
eg.fill_is_valid(1, eg.nr_class)
loss += eg.costs[guess]
eg.fill_scores(0, eg.c.nr_class)
eg.fill_costs(0, eg.c.nr_class)
eg.fill_is_valid(1, eg.c.nr_class)
return loss
def step_through(self, Doc doc):
@ -296,7 +303,7 @@ cdef class Parser:
# Doesn't set label into serializer -- subclasses override it to do that.
for action in self.moves.action_types:
self.moves.add_action(action, label)
cdef class StepwiseState:
cdef readonly StateClass stcls
@ -385,6 +392,14 @@ class ParserStateError(ValueError):
"Please include the text that the parser failed on, which is:\n"
"%s" % repr(doc.text))
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil:
cdef int best = -1
for i in range(n):
if costs[i] <= 0:
if best == -1 or scores[i] > scores[best]:
best = i
return best
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
int nr_class) except -1:
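
For readers of the FTRL-style update above: when the guessed action has non-zero cost, weights are pushed away from the guess and toward the best zero-cost (gold-consistent) action. A pure-Python analogue of arg_max_if_gold:

def arg_max_if_gold(scores, costs):
    best = -1
    for i in range(len(scores)):
        if costs[i] <= 0 and (best == -1 or scores[i] > scores[best]):
            best = i
    return best

# guess i=0 is costly; the best gold-consistent action is i=1
assert arg_max_if_gold([0.7, 0.6, 0.2], [1.0, 0.0, 0.0]) == 1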

View File

@ -13,13 +13,13 @@ from thinc.linalg cimport VecVec
from .typedefs cimport attr_t
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .gold cimport GoldParse
from .attrs cimport *
cpdef enum:
P2_orth
P2_cluster
@ -71,7 +71,7 @@ cpdef enum:
cdef class TaggerModel(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
_fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
_fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
_fill_from_token(&eg.atoms[W_orth], &tokens[i])
@ -152,6 +152,7 @@ cdef class Tagger:
model = TaggerModel(cfg.get('features', self.feature_templates))
self.vocab = vocab
self.model = model
self.model.l1_penalty = 0.0
# TODO: Move this to tag map
self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names:
@ -191,7 +192,7 @@ cdef class Tagger:
nr_class=self.vocab.morphology.n_tags,
nr_feat=self.model.nr_feat)
for i in range(tokens.length):
if tokens.c[i].pos == 0:
if tokens.c[i].pos == 0:
self.model.set_featuresC(&eg.c, tokens.c, i)
self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat)
@ -217,7 +218,7 @@ cdef class Tagger:
for doc in stream:
self(doc)
yield doc
def update(self, Doc tokens, GoldParse gold):
"""Update the statistical model, with tags supplied for the given document.
@ -251,7 +252,7 @@ cdef class Tagger:
self.model.updateC(&eg.c)
self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)
correct += eg.cost == 0
self.freqs[TAG][tokens.c[i].tag] += 1
eg.fill_scores(0, eg.c.nr_class)

View File

@ -4,9 +4,15 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
"""Test for Issue #792: Trailing whitespace is removed after parsing."""
"""Test for Issue #792: Trailing whitespace is removed after tokenization."""
doc = en_tokenizer(text)
assert doc.text_with_ws == text
assert ''.join([token.text_with_ws for token in doc]) == text
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
"""Test base case for Issue #792: Non-trailing whitespace"""
doc = en_tokenizer(text)
assert ''.join([token.text_with_ws for token in doc]) == text

View File

@ -0,0 +1,52 @@
'''
Test Matcher matches with '*' operator and Boolean flag
'''
from __future__ import unicode_literals
from __future__ import print_function
import pytest
from ...matcher import Matcher
from ...vocab import Vocab
from ...attrs import LOWER
from ...tokens import Doc
def test_basic_case():
matcher = Matcher(Vocab(
lex_attr_getters={LOWER: lambda string: string.lower()}))
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
matcher.add_pattern(
"FarAway",
[
{LOWER: "bob"},
{'OP': '*', LOWER: 'and'},
{LOWER: 'frank'}
])
doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
match = matcher(doc)
assert len(match) == 1
ent_id, label, start, end = match[0]
assert start == 0
assert end == 4
@pytest.mark.xfail
def test_issue850():
'''The problem here is that the variable-length pattern matches the
succeeding token. We then don't handle the ambiguity correctly.'''
matcher = Matcher(Vocab(
lex_attr_getters={LOWER: lambda string: string.lower()}))
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
matcher.add_pattern(
"FarAway",
[
{LOWER: "bob"},
{'OP': '*', IS_ANY_TOKEN: True},
{LOWER: 'frank'}
])
doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
match = matcher(doc)
assert len(match) == 1
ent_id, label, start, end = match[0]
assert start == 0
assert end == 4

View File

@ -0,0 +1,12 @@
# encoding: utf8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
"aaabbb@ccc.com \nThank you!"])
def test_issue859(en_tokenizer, text):
"""Test that no extra space is added in doc.text method."""
doc = en_tokenizer(text)
assert doc.text == text

View File

@ -0,0 +1,40 @@
from __future__ import unicode_literals
import io
import pytest
import dill as pickle
from ..strings import StringStore
from ..vocab import Vocab
from ..attrs import NORM
def test_pickle_string_store():
sstore = StringStore()
hello = sstore['hello']
bye = sstore['bye']
bdata = pickle.dumps(sstore, protocol=-1)
unpickled = pickle.loads(bdata)
assert unpickled['hello'] == hello
assert unpickled['bye'] == bye
assert len(sstore) == len(unpickled)
@pytest.mark.xfail
def test_pickle_vocab():
vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
dog = vocab[u'dog']
cat = vocab[u'cat']
assert dog.norm_ == 'do'
assert cat.norm_ == 'ca'
bdata = pickle.dumps(vocab)
unpickled = pickle.loads(bdata)
assert unpickled[u'dog'].orth == dog.orth
assert unpickled[u'cat'].orth == cat.orth
assert unpickled[u'dog'].norm == dog.norm
assert unpickled[u'cat'].norm == cat.norm
dog_ = unpickled[u'dog']
cat_ = unpickled[u'cat']
assert dog_.norm != cat_.norm

View File

@ -163,7 +163,6 @@ cdef class Tokenizer:
start = i
in_ws = not in_ws
i += 1
i += 1
if start < i:
span = string[start:]
key = hash_string(span)
@ -275,7 +274,10 @@ cdef class Tokenizer:
if cache_hit:
pass
elif self.token_match and self.token_match(string):
tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size())
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
matches = self.find_infix(string)
if not matches:
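
The invariant these tokenizer changes protect (see the Issue #792 and #859 tests above): joining each token's text_with_ws must reproduce the input string exactly, trailing whitespace included. A minimal stand-in illustration (Tok is a hypothetical stand-in for spaCy's Token):

from collections import namedtuple

Tok = namedtuple('Tok', ['text_with_ws'])
doc = [Tok('This '), Tok('is '), Tok('a '), Tok('string ')]
assert ''.join(t.text_with_ws for t in doc) == 'This is a string '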

View File

@ -16,7 +16,7 @@ from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport Lexeme
from .span cimport Span
@ -59,13 +59,13 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
cdef class Doc:
"""
A sequence of `Token` objects. Access sentences and named entities,
export annotations to numpy arrays, losslessly serialize to compressed
A sequence of `Token` objects. Access sentences and named entities,
export annotations to numpy arrays, losslessly serialize to compressed
binary strings.
Aside: Internals
The `Doc` object holds an array of `TokenC` structs.
The Python-level `Token` and `Span` objects are views of this
The `Doc` object holds an array of `TokenC` structs.
The Python-level `Token` and `Span` objects are views of this
array, i.e. they don't own the data themselves.
Code: Construction 1
@ -80,13 +80,13 @@ cdef class Doc:
Create a Doc object.
Aside: Implementation
This method of constructing a `Doc` object is usually only used
for deserialization. Standard usage is to construct the document via
This method of constructing a `Doc` object is usually only used
for deserialization. Standard usage is to construct the document via
a call to the language object.
Arguments:
vocab:
A Vocabulary object, which must match any models you want to
A Vocabulary object, which must match any models you want to
use (e.g. tokenizer, parser, entity recognizer).
words:
@ -156,19 +156,19 @@ cdef class Doc:
if self.length == 0:
self.is_tagged = True
self.is_parsed = True
def __getitem__(self, object i):
'''
doc[i]
Get the Token object at position i, where i is an integer.
Negative indexing is supported, and follows the usual Python
Get the Token object at position i, where i is an integer.
Negative indexing is supported, and follows the usual Python
semantics, i.e. doc[-2] is doc[len(doc) - 2].
doc[start : end]]
Get a `Span` object, starting at position `start`
and ending at position `end`, where `start` and
`end` are token indices. For instance,
`doc[2:5]` produces a span consisting of
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
`doc[2:5]` produces a span consisting of
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
are not supported, as `Span` objects must be contiguous (cannot have gaps).
You can use negative indices and open-ended ranges, which have their
normal Python semantics.
@ -188,11 +188,11 @@ cdef class Doc:
def __iter__(self):
'''
for token in doc
Iterate over `Token` objects, from which the annotations can
be easily accessed. This is the main way of accessing Token
objects, which are the main way annotations are accessed from
Python. If faster-than-Python speeds are required, you can
instead access the annotations as a numpy array, or access the
Iterate over `Token` objects, from which the annotations can
be easily accessed. This is the main way of accessing Token
objects, which are the main way annotations are accessed from
Python. If faster-than-Python speeds are required, you can
instead access the annotations as a numpy array, or access the
underlying C data directly from Cython.
'''
cdef int i
@ -251,13 +251,13 @@ cdef class Doc:
def __get__(self):
if 'has_vector' in self.user_hooks:
return self.user_hooks['has_vector'](self)
return any(token.has_vector for token in self)
property vector:
'''
A real-valued meaning representation. Defaults to an average of the token vectors.
Type: numpy.ndarray[ndim=1, dtype='float32']
'''
def __get__(self):
@ -285,14 +285,14 @@ cdef class Doc:
norm += value * value
self._vector_norm = sqrt(norm) if norm != 0 else 0
return self._vector_norm
def __set__(self, value):
self._vector_norm = value
self._vector_norm = value
@property
def string(self):
return self.text
property text:
'''A unicode representation of the document text.'''
def __get__(self):
@ -306,7 +306,7 @@ cdef class Doc:
property ents:
'''
Yields named-entity `Span` objects, if the entity recognizer
has been applied to the document. Iterate over the span to get
has been applied to the document. Iterate over the span to get
individual Token objects, or access the label:
Example:
@ -352,7 +352,7 @@ cdef class Doc:
cdef int i
for i in range(self.length):
self.c[i].ent_type = 0
# At this point we don't know whether the NER has run over the
# At this point we don't know whether the NER has run over the
# Doc. If the ent_iob is missing, leave it missing.
if self.c[i].ent_iob != 0:
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
@ -384,9 +384,9 @@ cdef class Doc:
property noun_chunks:
'''
Yields base noun-phrase #[code Span] objects, if the document
has been syntactically parsed. A base noun phrase, or
'NP chunk', is a noun phrase that does not permit other NPs to
be nested within it so no NP-level coordination, no prepositional
has been syntactically parsed. A base noun phrase, or
'NP chunk', is a noun phrase that does not permit other NPs to
be nested within it so no NP-level coordination, no prepositional
phrases, and no relative clauses. For example:
'''
def __get__(self):
@ -422,7 +422,7 @@ cdef class Doc:
def __get__(self):
if 'sents' in self.user_hooks:
return self.user_hooks['sents'](self)
if not self.is_parsed:
raise ValueError(
"sentence boundary detection requires the dependency parse, which "
@ -465,8 +465,8 @@ cdef class Doc:
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""
Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape (N, M), where `N` is the length
Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape (N, M), where `N` is the length
of the document. The values will be 32-bit integers.
Example:
@ -474,7 +474,7 @@ cdef class Doc:
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
Arguments:
attr_ids (list[int]): A list of attribute ID ints.
@ -520,7 +520,7 @@ cdef class Doc:
cdef int i
cdef attr_t attr
cdef size_t count
if counts is None:
counts = PreshCounter()
output_dict = True
@ -570,7 +570,7 @@ cdef class Doc:
cdef TokenC* tokens = self.c
cdef int length = len(array)
cdef attr_t[:] values
for col, attr_id in enumerate(attrs):
for col, attr_id in enumerate(attrs):
values = array[:, col]
if attr_id == HEAD:
for i in range(length):
@ -612,11 +612,11 @@ cdef class Doc:
'''Deserialize, loading from bytes.'''
self.vocab.serializer.unpack_into(data[4:], self)
return self
@staticmethod
def read_bytes(file_):
'''
A static method, used to read serialized #[code Doc] objects from
A static method, used to read serialized #[code Doc] objects from
a file. For example:
Example:
@ -673,7 +673,7 @@ cdef class Doc:
"Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
"Arguments supplied:\n%s\n"
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1:
return None
@ -784,7 +784,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
if child.l_edge < head.l_edge:
head.l_edge = child.l_edge
head.l_kids += 1
# Set right edges --- same as above, but iterate in reverse
for i in range(length-1, -1, -1):
child = &tokens[i]
@ -798,4 +798,4 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
for i in range(length):
if tokens[i].head == 0 and tokens[i].dep != 0:
tokens[tokens[i].l_edge].sent_start = True

View File

@ -20,7 +20,7 @@ from .. import parts_of_speech
from ..attrs cimport LEMMA
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP
from ..parts_of_speech cimport CONJ, PUNCT
from ..parts_of_speech cimport CCONJ, PUNCT
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET
@ -84,7 +84,7 @@ cdef class Token:
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
'''Check the value of a boolean flag.
Arguments:
flag_id (int): The ID of the flag attribute.
Returns:
@ -225,7 +225,7 @@ cdef class Token:
property vector:
'''
A real-valued meaning representation.
Type: numpy.ndarray[ndim=1, dtype='float32']
'''
def __get__(self):
@ -343,7 +343,7 @@ cdef class Token:
'''
def __get__(self):
cdef const TokenC* head_ptr = self.c
# guard against infinite loop, no token can have
# guard against infinite loop, no token can have
# more ancestors than tokens in the tree
cdef int i = 0
while head_ptr.head != 0 and i < self.doc.length:
@ -370,7 +370,7 @@ cdef class Token:
property head:
'''The syntactic parent, or "governor", of this token.
Returns: Token
'''
def __get__(self):
@ -390,7 +390,7 @@ cdef class Token:
# is the new head a descendant of the old head
cdef bint is_desc = old_head.is_ancestor_of(new_head)
cdef int new_edge
cdef Token anc, child
@ -420,7 +420,7 @@ cdef class Token:
if anc.c.l_edge <= new_edge:
break
anc.c.l_edge = new_edge
elif self.c.head < 0: # right dependent
old_head.c.r_kids -= 1
# do the same thing as for l_edge
@ -435,7 +435,7 @@ cdef class Token:
if child.c.r_edge > new_edge:
new_edge = child.c.r_edge
old_head.c.r_edge = new_edge
for anc in old_head.ancestors:
if anc.c.r_edge >= new_edge:
break
@ -598,19 +598,19 @@ cdef class Token:
property is_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
property is_space:
property is_space:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
property is_bracket:
property is_bracket:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
property is_quote:
property is_quote:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
property is_left_punct:
property is_left_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
property is_right_punct:
property is_right_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
property like_url:

View File

@ -9,11 +9,16 @@ import bz2
import ujson as json
import re
try:
import cPickle as pickle
except ImportError:
import pickle
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
from .strings cimport hash_string
from .typedefs cimport attr_t
from .cfile cimport CFile
from .cfile cimport CFile, StringCFile
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .tokens.token cimport Token
@ -346,17 +351,18 @@ cdef class Vocab:
Token.set_struct_attr(token, attr_id, value)
return tokens
def dump(self, loc):
"""Save the lexemes binary data to the given location.
def dump(self, loc=None):
"""Save the lexemes binary data to the given location, or
return a byte-string with the data if loc is None.
Arguments:
loc (Path): The path to save to.
loc (Path or None): The path to save to, or None.
"""
if hasattr(loc, 'as_posix'):
loc = loc.as_posix()
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
cdef CFile fp = CFile(bytes_loc, 'wb')
cdef CFile fp
if loc is None:
fp = StringCFile('wb')
else:
fp = CFile(loc, 'wb')
cdef size_t st
cdef size_t addr
cdef hash_t key
@ -378,6 +384,8 @@ cdef class Vocab:
fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
fp.close()
if loc is None:
return fp.string_data()
def load_lexemes(self, loc):
'''Load the binary vocabulary data from the given location.
@ -427,6 +435,60 @@ cdef class Vocab:
i += 1
fp.close()
def _deserialize_lexemes(self, CFile fp):
'''Load the binary vocabulary data from the given CFile.
'''
cdef LexemeC* lexeme
cdef hash_t key
cdef unicode py_str
cdef attr_t orth
assert sizeof(orth) == sizeof(lexeme.orth)
i = 0
cdef int todo = fp.size
cdef int lex_size = sizeof(lexeme.flags)
lex_size += sizeof(lexeme.id)
lex_size += sizeof(lexeme.length)
lex_size += sizeof(lexeme.orth)
lex_size += sizeof(lexeme.lower)
lex_size += sizeof(lexeme.norm)
lex_size += sizeof(lexeme.shape)
lex_size += sizeof(lexeme.prefix)
lex_size += sizeof(lexeme.suffix)
lex_size += sizeof(lexeme.cluster)
lex_size += sizeof(lexeme.prob)
lex_size += sizeof(lexeme.sentiment)
lex_size += sizeof(lexeme.l2_norm)
lex_size += sizeof(lexeme.lang)
while True:
if todo < lex_size:
break
todo -= lex_size
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
# Copy data from the file into the lexeme
fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))
fp.read_into(&lexeme.length, 1, sizeof(lexeme.length))
fp.read_into(&lexeme.orth, 1, sizeof(lexeme.orth))
fp.read_into(&lexeme.lower, 1, sizeof(lexeme.lower))
fp.read_into(&lexeme.norm, 1, sizeof(lexeme.norm))
fp.read_into(&lexeme.shape, 1, sizeof(lexeme.shape))
fp.read_into(&lexeme.prefix, 1, sizeof(lexeme.prefix))
fp.read_into(&lexeme.suffix, 1, sizeof(lexeme.suffix))
fp.read_into(&lexeme.cluster, 1, sizeof(lexeme.cluster))
fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))
lexeme.vector = EMPTY_VEC
py_str = self.strings[lexeme.orth]
key = hash_string(py_str)
self._by_hash.set(key, lexeme)
self._by_orth.set(lexeme.orth, lexeme)
self.length += 1
i += 1
fp.close()
def dump_vectors(self, out_loc):
'''Save the word vectors to a binary file.
@ -553,6 +615,42 @@ cdef class Vocab:
return vec_len
def pickle_vocab(vocab):
sstore = vocab.strings
morph = vocab.morphology
length = vocab.length
serializer = vocab._serializer
data_dir = vocab.data_dir
lex_attr_getters = vocab.lex_attr_getters
lexemes_data = vocab.dump()
vectors_length = vocab.vectors_length
return (unpickle_vocab,
(sstore, morph, serializer, data_dir, lex_attr_getters,
lexemes_data, length, vectors_length))
def unpickle_vocab(sstore, morphology, serializer, data_dir,
lex_attr_getters, bytes lexemes_data, int length, int vectors_length):
cdef Vocab vocab = Vocab()
vocab.length = length
vocab.vectors_length = vectors_length
vocab.strings = sstore
cdef CFile fp = StringCFile('r', data=lexemes_data)
vocab.morphology = morphology
vocab._serializer = serializer
vocab.data_dir = data_dir
vocab.lex_attr_getters = lex_attr_getters
vocab._deserialize_lexemes(fp)
vocab.length = length
vocab.vectors_length = vectors_length
return vocab
copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab)
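
pickle_vocab/unpickle_vocab follow the standard copy_reg protocol: the reducer returns a (reconstructor, args) pair. Note that test_pickle_vocab above is still marked xfail, so the full Vocab round trip was a work in progress; the pattern itself is illustrated below with a self-contained toy class (Box is hypothetical, and copyreg is the Python 3 name for copy_reg):

import copyreg
import pickle

class Box(object):
    def __init__(self, payload=b''):
        self.payload = payload

def pickle_box(box):
    # Reducer: tell pickle how to reconstruct a Box
    return (unpickle_box, (box.payload,))

def unpickle_box(payload):
    return Box(payload)

copyreg.pickle(Box, pickle_box)
box = pickle.loads(pickle.dumps(Box(b'lexemes')))
assert box.payload == b'lexemes'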
def write_binary_vectors(in_loc, out_loc):
cdef CFile out_file = CFile(out_loc, 'wb')
cdef Address mem