mirror of https://github.com/explosion/spaCy.git (synced 2025-06-05 13:43:24 +03:00)

Merge branch 'develop'

commit ea53647362
@@ -66,8 +66,8 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
 def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
           n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
-    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
-    format_str = '{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
+    print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
+    format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
     with Language.train(model_dir, train_data,
                         tagger_cfg, parser_cfg, entity_cfg) as trainer:
         loss = 0
@@ -76,11 +76,13 @@ def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, ent
             for doc, gold in epoch:
                 trainer.update(doc, gold)
             dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
-            print(format_str.format(itn, loss, **dev_scores.scores))
+            print(format_str.format(itn, trainer.nlp.parser.model.nr_weight,
+                                    trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores))


 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
              beam_width=None, cand_preproc=None):
+    print("Load parser", model_dir)
     nlp = Language(path=model_dir)
     if nlp.lang == 'de':
         nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
@@ -145,9 +147,11 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
     verbose=("Verbose error reporting", "flag", "v", bool),
     debug=("Debug mode", "flag", "d", bool),
     pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
+    L1=("L1 regularization penalty", "option", "L", float),
 )
 def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
-         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
+         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
+         L1=1e-6):
     parser_cfg = dict(locals())
     tagger_cfg = dict(locals())
     entity_cfg = dict(locals())
@@ -160,6 +164,8 @@ def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc=
     if not eval_only:
         gold_train = list(read_json_file(train_loc))
         gold_dev = list(read_json_file(dev_loc))
+        if n_sents > 0:
+            gold_train = gold_train[:n_sents]
         train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
               n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
               n_iter=n_iter)
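The new L1 option tuple follows the (help, kind, abbreviation, type) annotation convention of the plac library used by these training scripts. A minimal standalone sketch of how such an annotation becomes a command-line flag (script name and values are illustrative, not from the diff):

    import plac

    @plac.annotations(
        L1=("L1 regularization penalty", "option", "L", float),
    )
    def main(L1=1e-6):
        # plac exposes this parameter as -L / --L1 on the command line.
        print("L1 penalty:", L1)

    if __name__ == '__main__':
        plac.call(main)  # e.g. python train.py -L 0.001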
@@ -10,3 +10,4 @@ six
 ujson>=1.35
 cloudpickle
 sputnik>=0.9.2,<0.10.0
+dill>=0.2,<0.3
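dill is a pickle-compatible serializer that handles objects the standard library refuses, notably lambdas -- which matters for the Vocab pickling work further down, since a Vocab carries lex_attr_getters lambdas. A quick illustration, assuming only that dill is installed:

    import pickle
    import dill

    func = lambda string: string.lower()
    try:
        pickle.dumps(func)      # the stdlib pickler typically rejects lambdas
    except Exception as err:
        print("pickle failed:", err)
    restored = dill.loads(dill.dumps(func))
    assert restored('ABC') == 'abc'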
setup.py (3 lines changed)
@@ -241,7 +241,8 @@ def setup_package():
             'cloudpickle',
             'pathlib',
             'sputnik>=0.9.2,<0.10.0',
-            'ujson>=1.35'],
+            'ujson>=1.35',
+            'dill>=0.2,<0.3'],
         classifiers=[
             'Development Status :: 5 - Production/Stable',
             'Environment :: Console',
@@ -125,7 +125,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
             'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
             'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
             'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
-            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case']
+            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
+            'Polarity', # U20
+        ]
         for key in morph_keys:
             if key in stringy_attrs:
                 stringy_attrs.pop(key)
@@ -4,6 +4,20 @@ from cymem.cymem cimport Pool
 cdef class CFile:
     cdef FILE* fp
     cdef bint is_open
+    cdef Pool mem
+    cdef int size # For compatibility with subclass
+    cdef int _capacity # For compatibility with subclass
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
+
+
+cdef class StringCFile(CFile):
+    cdef unsigned char* data
+
     cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
@@ -1,4 +1,5 @@
 from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from libc.string cimport memcpy


 cdef class CFile:
@@ -9,6 +10,7 @@ cdef class CFile:
         mode_str = mode
         if hasattr(loc, 'as_posix'):
             loc = loc.as_posix()
+        self.mem = Pool()
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
         self.fp = fopen(<char*>bytes_loc, mode_str)
         if self.fp == NULL:
@@ -45,3 +47,42 @@ cdef class CFile:
         cdef bytes py_bytes = value.encode('utf8')
         cdef char* chars = <char*>py_bytes
         self.write(sizeof(char), len(py_bytes), chars)
+
+
+cdef class StringCFile:
+    def __init__(self, mode, bytes data=b'', on_open_error=None):
+        self.mem = Pool()
+        self.is_open = 'w' in mode
+        self._capacity = max(len(data), 8)
+        self.size = len(data)
+        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
+        for i in range(len(data)):
+            self.data[i] = data[i]
+
+    def close(self):
+        self.is_open = False
+
+    def string_data(self):
+        return (self.data-self.size)[:self.size]
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
+        memcpy(dest, self.data, elem_size * number)
+        self.data += elem_size * number
+
+    cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
+        write_size = number * elem_size
+        if (self.size + write_size) >= self._capacity:
+            self._capacity = (self.size + write_size) * 2
+            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
+        memcpy(&self.data[self.size], src, elem_size * number)
+        self.size += write_size
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
+        cdef void* dest = mem.alloc(number, elem_size)
+        self.read_into(dest, number, elem_size)
+        return dest
+
+    def write_unicode(self, unicode value):
+        cdef bytes py_bytes = value.encode('utf8')
+        cdef char* chars = <char*>py_bytes
+        self.write(sizeof(char), len(py_bytes), chars)
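StringCFile emulates the CFile read/write interface over an in-memory buffer that doubles its capacity when a write would overflow it. A hypothetical pure-Python analogue of the same bookkeeping, for illustration only (class and method names invented, not spaCy API):

    class StringBuffer(object):
        def __init__(self, data=b''):
            self._capacity = max(len(data), 8)
            self.size = len(data)
            self.data = bytearray(self._capacity)
            self.data[:len(data)] = data
            self._read_pos = 0

        def write_from(self, src):
            write_size = len(src)
            if self.size + write_size >= self._capacity:
                # Double the capacity, as in the Cython version above.
                self._capacity = (self.size + write_size) * 2
                self.data.extend(bytearray(self._capacity - len(self.data)))
            self.data[self.size:self.size + write_size] = src
            self.size += write_size

        def read_into(self, n):
            # Sequential read that advances an internal cursor.
            chunk = bytes(self.data[self._read_pos:self._read_pos + n])
            self._read_pos += n
            return chunk

        def string_data(self):
            # Only the bytes written so far.
            return bytes(self.data[:self.size])

    buf = StringBuffer()
    buf.write_from(b'hello ')
    buf.write_from(b'world')
    assert buf.string_data() == b'hello world'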
@@ -41,7 +41,7 @@ TAG_MAP = {
     "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
     "PTKA": {POS: PART},
     "PTKANT": {POS: PART, "PartType": "res"},
-    "PTKNEG": {POS: PART, "Negative": "yes"},
+    "PTKNEG": {POS: PART, "Polarity": "Neg"},
     "PTKVZ": {POS: PART, "PartType": "vbp"},
     "PTKZU": {POS: PART, "PartType": "inf"},
     "PWAT": {POS: DET, "PronType": "int"},
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals, print_function

 from os import path
+from pathlib import Path

 from ..util import match_best_version
 from ..util import get_data_path
@@ -13,6 +14,11 @@ from ..attrs import LANG

 from .language_data import *

+try:
+    basestring
+except NameError:
+    basestring = str
+

 class English(Language):
     lang = 'en'
@@ -43,14 +49,15 @@ def _fix_deprecated_glove_vectors_loading(overrides):
         data_path = get_data_path()
     else:
         path = overrides['path']
+        if isinstance(path, basestring):
+            path = Path(path)
         data_path = path.parent
     vec_path = None
     if 'add_vectors' not in overrides:
         if 'vectors' in overrides:
             vec_path = match_best_version(overrides['vectors'], None, data_path)
             if vec_path is None:
-                raise IOError(
-                    'Could not load data pack %s from %s' % (overrides['vectors'], data_path))
+                return overrides
         else:
             vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
             if vec_path is not None:
@@ -16,7 +16,7 @@ TAG_MAP = {
     "$": {POS: SYM, "Other": {"SymType": "currency"}},
     "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
     "AFX": {POS: ADJ, "Hyph": "yes"},
-    "CC": {POS: CONJ, "ConjType": "coor"},
+    "CC": {POS: CCONJ, "ConjType": "coor"},
     "CD": {POS: NUM, "NumType": "card"},
     "DT": {POS: DET},
     "EX": {POS: ADV, "AdvType": "ex"},
@@ -5,7 +5,7 @@ import pathlib
 from contextlib import contextmanager
 import shutil

-import ujson as json
+import ujson


 try:
@@ -13,6 +13,10 @@ try:
 except NameError:
     basestring = str

+try:
+    unicode
+except NameError:
+    unicode = str

 from .tokenizer import Tokenizer
 from .vocab import Vocab
@@ -226,12 +230,21 @@ class Language(object):
         parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
         entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)

-        with (dep_model_dir / 'config.json').open('w') as file_:
-            json.dump(parser_cfg, file_)
-        with (ner_model_dir / 'config.json').open('w') as file_:
-            json.dump(entity_cfg, file_)
-        with (pos_model_dir / 'config.json').open('w') as file_:
-            json.dump(tagger_cfg, file_)
+        with (dep_model_dir / 'config.json').open('wb') as file_:
+            data = ujson.dumps(parser_cfg)
+            if isinstance(data, unicode):
+                data = data.encode('utf8')
+            file_.write(data)
+        with (ner_model_dir / 'config.json').open('wb') as file_:
+            data = ujson.dumps(entity_cfg)
+            if isinstance(data, unicode):
+                data = data.encode('utf8')
+            file_.write(data)
+        with (pos_model_dir / 'config.json').open('wb') as file_:
+            data = ujson.dumps(tagger_cfg)
+            if isinstance(data, unicode):
+                data = data.encode('utf8')
+            file_.write(data)

         self = cls(
             path=path,
@@ -252,7 +265,7 @@ class Language(object):
         self.entity = self.Defaults.create_entity(self)
         self.pipeline = self.Defaults.create_pipeline(self)
         yield Trainer(self, gold_tuples)
-        self.end_training()
+        self.end_training(path=path)

     def __init__(self, **overrides):
         if 'data_dir' in overrides and 'path' not in overrides:
@@ -391,12 +404,14 @@ class Language(object):
         else:
             entity_iob_freqs = []
             entity_type_freqs = []
-        with (path / 'vocab' / 'serializer.json').open('w') as file_:
-            file_.write(
-                json.dumps([
+        with (path / 'vocab' / 'serializer.json').open('wb') as file_:
+            data = ujson.dumps([
                 (TAG, tagger_freqs),
                 (DEP, dep_freqs),
                 (ENT_IOB, entity_iob_freqs),
                 (ENT_TYPE, entity_type_freqs),
                 (HEAD, head_freqs)
-                ]))
+            ])
+            if isinstance(data, unicode):
+                data = data.encode('utf8')
+            file_.write(data)
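All of these blocks repeat the same dumps-encode-write dance: ujson.dumps returns text, which must be encoded by hand once the file is opened in binary mode so the behaviour matches on Python 2 and 3. A hypothetical helper capturing the pattern (not part of the diff):

    import ujson

    def write_json_utf8(path, obj):
        # ujson.dumps returns a text string; encode it ourselves so the
        # target file can always be opened in binary mode.
        data = ujson.dumps(obj)
        if not isinstance(data, bytes):
            data = data.encode('utf8')
        with open(str(path), 'wb') as file_:
            file_.write(data)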
@@ -19,6 +19,7 @@ TAG_MAP = {
     "AUX": {POS: AUX},
     "X": {POS: X},
     "CONJ": {POS: CONJ},
+    "CCONJ": {POS: CCONJ}, # U20
     "ADJ": {POS: ADJ},
     "VERB": {POS: VERB},
     "PART": {POS: PART}
@@ -80,6 +80,7 @@ cpdef enum univ_morph_t:
     Definite_two
     Definite_def
     Definite_red
+    Definite_cons # U20
     Definite_ind
     Degree_cmp
     Degree_comp
@@ -103,6 +104,8 @@ cpdef enum univ_morph_t:
     Negative_neg
     Negative_pos
     Negative_yes
+    Polarity_neg # U20
+    Polarity_pos # U20
     Number_com
     Number_dual
     Number_none
@@ -151,6 +154,7 @@ cpdef enum univ_morph_t:
     VerbForm_partPres
     VerbForm_sup
     VerbForm_trans
+    VerbForm_conv # U20
     VerbForm_gdv # la
     Voice_act
     Voice_cau
@@ -192,6 +192,7 @@ IDS = {
     "Definite_two": Definite_two,
     "Definite_def": Definite_def,
     "Definite_red": Definite_red,
+    "Definite_cons": Definite_cons, # U20
     "Definite_ind": Definite_ind,
     "Degree_cmp": Degree_cmp,
     "Degree_comp": Degree_comp,
@@ -215,6 +216,8 @@ IDS = {
     "Negative_neg": Negative_neg,
     "Negative_pos": Negative_pos,
     "Negative_yes": Negative_yes,
+    "Polarity_neg": Polarity_neg, # U20
+    "Polarity_pos": Polarity_pos, # U20
     "Number_com": Number_com,
     "Number_dual": Number_dual,
     "Number_none": Number_none,
@@ -263,6 +266,7 @@ IDS = {
     "VerbForm_partPres": VerbForm_partPres,
     "VerbForm_sup": VerbForm_sup,
     "VerbForm_trans": VerbForm_trans,
+    "VerbForm_conv": VerbForm_conv, # U20
     "VerbForm_gdv ": VerbForm_gdv, # la,
     "Voice_act": Voice_act,
     "Voice_cau": Voice_cau,
@@ -7,6 +7,7 @@ cpdef enum univ_pos_t:
     ADV
     AUX
     CONJ
+    CCONJ # U20
     DET
     INTJ
     NOUN
@@ -7,7 +7,8 @@ IDS = {
     "ADP": ADP,
     "ADV": ADV,
     "AUX": AUX,
-    "CONJ": CONJ,
+    "CONJ": CONJ, # U20
+    "CCONJ": CCONJ,
     "DET": DET,
     "INTJ": INTJ,
     "NOUN": NOUN,
@@ -3,7 +3,7 @@ from __future__ import unicode_literals, absolute_import

 cimport cython
 from libc.string cimport memcpy
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint64_t, uint32_t

 from murmurhash.mrmr cimport hash64, hash32

@@ -12,22 +12,19 @@ from preshed.maps cimport map_iter, key_t
 from .typedefs cimport hash_t
 from libc.stdint cimport uint32_t

-try:
-    import ujson as json
-except ImportError:
-    import json
+import ujson


 cpdef hash_t hash_string(unicode string) except 0:
     chars = string.encode('utf8')
-    return _hash_utf8(chars, len(chars))
+    return hash_utf8(chars, len(chars))


-cdef hash_t _hash_utf8(char* utf8_string, int length):
+cdef hash_t hash_utf8(char* utf8_string, int length) nogil:
     return hash64(utf8_string, length, 1)


-cdef uint32_t _hash32_utf8(char* utf8_string, int length):
+cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
     return hash32(utf8_string, length, 1)


@@ -48,11 +45,11 @@ cdef unicode _decode(const Utf8Str* string):
     return string.p[i:length + i].decode('utf8')


-cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *:
+cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
     cdef int n_length_bytes
     cdef int i
     cdef Utf8Str string
-    assert length != 0
+    cdef uint32_t ulength = length
     if length < sizeof(string.s):
         string.s[0] = <unsigned char>length
         memcpy(&string.s[1], chars, length)
@@ -98,6 +95,14 @@ cdef class StringStore:
         def __get__(self):
             return self.size -1

+    def __reduce__(self):
+        # TODO: OOV words, for the is_frozen stuff?
+        if self.is_frozen:
+            raise NotImplementedError(
+                "Currently missing support for pickling StringStore when "
+                "is_frozen=True")
+        return (StringStore, (list(self),))
+
     def __len__(self):
         """The number of strings in the store.

@@ -149,7 +154,7 @@ cdef class StringStore:
             # pretty bad.
             # We could also get unlucky here, and hash into a value that
             # collides with the 'real' strings.
-            return _hash32_utf8(byte_string, len(byte_string))
+            return hash32_utf8(byte_string, len(byte_string))
         else:
             return utf8str - self.c

@@ -200,7 +205,7 @@ cdef class StringStore:
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
-        cdef hash_t key = _hash_utf8(utf8_string, length)
+        cdef hash_t key = hash_utf8(utf8_string, length)
         cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
         if value is not NULL:
             return value
@@ -209,7 +214,7 @@ cdef class StringStore:
             return value
         if self.is_frozen:
             # OOV store uses 32 bit hashes. Pretty ugly :(
-            key32 = _hash32_utf8(utf8_string, length)
+            key32 = hash32_utf8(utf8_string, length)
             # Important: Make the OOV store own the memory. That way it's trivial
             # to flush them all.
             value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
@@ -232,7 +237,7 @@ cdef class StringStore:
         Returns:
             None
         """
-        string_data = json.dumps(list(self))
+        string_data = ujson.dumps(list(self))
         if not isinstance(string_data, unicode):
             string_data = string_data.decode('utf8')
         # TODO: OOV?
@@ -246,7 +251,7 @@ cdef class StringStore:
         Returns:
             None
         """
-        strings = json.load(file_)
+        strings = ujson.load(file_)
         if strings == ['']:
             return None
         cdef unicode string
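The new __reduce__ method is what makes StringStore pickle-able: pickle stores a callable plus the arguments needed to rebuild an equivalent object, here the full list of interned strings. The protocol in miniature (plain-Python illustration, not spaCy code):

    import pickle

    class Store(object):
        def __init__(self, strings=()):
            self._strings = list(strings)

        def __reduce__(self):
            # Rebuild by calling Store(list_of_strings) at unpickle time.
            return (Store, (list(self._strings),))

    data = pickle.dumps(Store(['hello', 'bye']))
    restored = pickle.loads(data)
    assert restored._strings == ['hello', 'bye']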
@@ -90,6 +90,7 @@ cpdef enum symbol_t:
     ADV
     AUX
     CONJ
+    CCONJ # U20
     DET
     INTJ
     NOUN
@@ -107,11 +108,14 @@ cpdef enum symbol_t:

     Animacy_anim
     Animacy_inam
+    Animacy_hum # U20
     Aspect_freq
     Aspect_imp
     Aspect_mod
     Aspect_none
     Aspect_perf
+    Aspect_iter # U20
+    Aspect_hab # U20
     Case_abe
     Case_abl
     Case_abs
@@ -120,10 +124,12 @@ cpdef enum symbol_t:
     Case_all
     Case_cau
     Case_com
+    Case_cmp # U20
     Case_dat
     Case_del
     Case_dis
     Case_ela
+    Case_equ # U20
     Case_ess
     Case_gen
     Case_ill
@@ -142,7 +148,9 @@ cpdef enum symbol_t:
     Definite_two
     Definite_def
     Definite_red
+    Definite_cons # U20
     Definite_ind
+    Definite_spec # U20
     Degree_cmp
     Degree_comp
     Degree_none
@@ -151,6 +159,8 @@ cpdef enum symbol_t:
     Degree_abs
     Degree_com
     Degree_dim # du
+    Degree_equ # U20
+    Evident_nfh # U20
     Gender_com
     Gender_fem
     Gender_masc
@@ -162,16 +172,21 @@ cpdef enum symbol_t:
     Mood_pot
     Mood_sub
     Mood_opt
+    Mood_prp # U20
+    Mood_adm # U20
     Negative_neg
     Negative_pos
     Negative_yes
+    Polarity_neg # U20
+    Polarity_pos # U20
     Number_com
     Number_dual
     Number_none
     Number_plur
     Number_sing
     Number_ptan # bg
-    Number_count # bg
+    Number_count # bg, U20
+    Number_tri # U20
     NumType_card
     NumType_dist
     NumType_frac
@@ -197,7 +212,8 @@ cpdef enum symbol_t:
     PronType_rel
     PronType_tot
     PronType_clit
-    PronType_exc # es, ca, it, fa
+    PronType_exc # es, ca, it, fa, U20
+    PronType_emp # U20
     Reflex_yes
     Tense_fut
     Tense_imp
@@ -213,12 +229,17 @@ cpdef enum symbol_t:
     VerbForm_partPres
     VerbForm_sup
     VerbForm_trans
+    VerbForm_conv # U20
     VerbForm_gdv # la
+    VerbForm_vnoun # U20
     Voice_act
     Voice_cau
     Voice_pass
-    Voice_mid # gkc
+    Voice_mid # gkc, U20
     Voice_int # hb
+    Voice_antip # U20
+    Voice_dir # U20
+    Voice_inv # U20
     Abbr_yes # cz, fi, sl, U
     AdpType_prep # cz, U
     AdpType_post # U
@@ -284,6 +305,10 @@ cpdef enum symbol_t:
     Number_psee_plur # U
     Number_psor_sing # cz, fi, sl, U
     Number_psor_plur # cz, fi, sl, U
+    Number_pauc # U20
+    Number_grpa # U20
+    Number_grpl # U20
+    Number_inv # U20
     NumForm_digit # cz, sl, U
     NumForm_roman # cz, sl, U
     NumForm_word # cz, sl, U
@@ -311,6 +336,8 @@ cpdef enum symbol_t:
     Person_psor_one # fi, U
     Person_psor_two # fi, U
     Person_psor_three # fi, U
+    Person_zero # U20
+    Person_four # U20
     Polite_inf # bq, U
     Polite_pol # bq, U
     Polite_abs_inf # bq, U
@@ -319,6 +346,10 @@ cpdef enum symbol_t:
     Polite_erg_pol # bq, U
     Polite_dat_inf # bq, U
     Polite_dat_pol # bq, U
+    Polite_infm # U20
+    Polite_form # U20
+    Polite_form_elev # U20
+    Polite_form_humb # U20
     Prefix_yes # U
     PrepCase_npr # cz
     PrepCase_pre # U
@@ -383,6 +414,7 @@ cpdef enum symbol_t:
     ccomp
     complm
     conj
+    cop # U20
     csubj
     csubjpass
     dep
@@ -405,6 +437,8 @@ cpdef enum symbol_t:
     num
     number
     oprd
+    obj # U20
+    obl # U20
     parataxis
     partmod
     pcomp
@@ -91,6 +91,7 @@ IDS = {
     "ADV": ADV,
     "AUX": AUX,
     "CONJ": CONJ,
+    "CCONJ": CCONJ, # U20
     "DET": DET,
     "INTJ": INTJ,
     "NOUN": NOUN,
@@ -108,11 +109,14 @@ IDS = {

     "Animacy_anim": Animacy_anim,
     "Animacy_inam": Animacy_inam,
+    "Animacy_hum": Animacy_hum, # U20
     "Aspect_freq": Aspect_freq,
     "Aspect_imp": Aspect_imp,
     "Aspect_mod": Aspect_mod,
     "Aspect_none": Aspect_none,
     "Aspect_perf": Aspect_perf,
+    "Aspect_iter": Aspect_iter, # U20
+    "Aspect_hab": Aspect_hab, # U20
     "Case_abe": Case_abe,
     "Case_abl": Case_abl,
     "Case_abs": Case_abs,
@@ -121,10 +125,12 @@ IDS = {
     "Case_all": Case_all,
     "Case_cau": Case_cau,
     "Case_com": Case_com,
+    "Case_cmp": Case_cmp, # U20
     "Case_dat": Case_dat,
     "Case_del": Case_del,
     "Case_dis": Case_dis,
     "Case_ela": Case_ela,
+    "Case_equ": Case_equ, # U20
     "Case_ess": Case_ess,
     "Case_gen": Case_gen,
     "Case_ill": Case_ill,
@@ -143,7 +149,9 @@ IDS = {
     "Definite_two": Definite_two,
     "Definite_def": Definite_def,
     "Definite_red": Definite_red,
+    "Definite_cons": Definite_cons, # U20
     "Definite_ind": Definite_ind,
+    "Definite_spec": Definite_spec, # U20
     "Degree_cmp": Degree_cmp,
     "Degree_comp": Degree_comp,
     "Degree_none": Degree_none,
@@ -152,6 +160,8 @@ IDS = {
     "Degree_abs": Degree_abs,
     "Degree_com": Degree_com,
     "Degree_dim ": Degree_dim, # du
+    "Degree_equ": Degree_equ, # U20
+    "Evident_nfh": Evident_nfh, # U20
     "Gender_com": Gender_com,
     "Gender_fem": Gender_fem,
     "Gender_masc": Gender_masc,
@@ -163,16 +173,21 @@ IDS = {
     "Mood_pot": Mood_pot,
     "Mood_sub": Mood_sub,
     "Mood_opt": Mood_opt,
+    "Mood_prp": Mood_prp, # U20
+    "Mood_adm": Mood_adm, # U20
     "Negative_neg": Negative_neg,
     "Negative_pos": Negative_pos,
     "Negative_yes": Negative_yes,
+    "Polarity_neg": Polarity_neg, # U20
+    "Polarity_pos": Polarity_pos, # U20
     "Number_com": Number_com,
     "Number_dual": Number_dual,
     "Number_none": Number_none,
     "Number_plur": Number_plur,
     "Number_sing": Number_sing,
     "Number_ptan ": Number_ptan, # bg
-    "Number_count ": Number_count, # bg
+    "Number_count ": Number_count, # bg, U20
+    "Number_tri": Number_tri, # U20
     "NumType_card": NumType_card,
     "NumType_dist": NumType_dist,
     "NumType_frac": NumType_frac,
@@ -198,7 +213,8 @@ IDS = {
     "PronType_rel": PronType_rel,
     "PronType_tot": PronType_tot,
     "PronType_clit": PronType_clit,
-    "PronType_exc ": PronType_exc, # es, ca, it, fa,
+    "PronType_exc": PronType_exc, # es, ca, it, fa, U20
+    "PronType_emp": PronType_emp, # U20
     "Reflex_yes": Reflex_yes,
     "Tense_fut": Tense_fut,
     "Tense_imp": Tense_imp,
@@ -214,12 +230,17 @@ IDS = {
     "VerbForm_partPres": VerbForm_partPres,
     "VerbForm_sup": VerbForm_sup,
     "VerbForm_trans": VerbForm_trans,
+    "VerbForm_conv": VerbForm_conv, # U20
     "VerbForm_gdv ": VerbForm_gdv, # la,
+    "VerbForm_vnoun": VerbForm_vnoun, # U20
     "Voice_act": Voice_act,
     "Voice_cau": Voice_cau,
     "Voice_pass": Voice_pass,
-    "Voice_mid ": Voice_mid, # gkc,
+    "Voice_mid ": Voice_mid, # gkc, U20
     "Voice_int ": Voice_int, # hb,
+    "Voice_antip": Voice_antip, # U20
+    "Voice_dir": Voice_dir, # U20
+    "Voice_inv": Voice_inv, # U20
     "Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
     "AdpType_prep ": AdpType_prep, # cz, U,
     "AdpType_post ": AdpType_post, # U,
@@ -285,6 +306,10 @@ IDS = {
     "Number_psee_plur ": Number_psee_plur, # U,
     "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
     "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
+    "Number_pauc": Number_pauc, # U20
+    "Number_grpa": Number_grpa, # U20
+    "Number_grpl": Number_grpl, # U20
+    "Number_inv": Number_inv, # U20
     "NumForm_digit ": NumForm_digit, # cz, sl, U,
     "NumForm_roman ": NumForm_roman, # cz, sl, U,
     "NumForm_word ": NumForm_word, # cz, sl, U,
@@ -312,6 +337,8 @@ IDS = {
     "Person_psor_one ": Person_psor_one, # fi, U,
     "Person_psor_two ": Person_psor_two, # fi, U,
     "Person_psor_three ": Person_psor_three, # fi, U,
+    "Person_zero ": Person_zero, # U20
+    "Person_four ": Person_four, # U20
     "Polite_inf ": Polite_inf, # bq, U,
     "Polite_pol ": Polite_pol, # bq, U,
     "Polite_abs_inf ": Polite_abs_inf, # bq, U,
@@ -320,6 +347,10 @@ IDS = {
     "Polite_erg_pol ": Polite_erg_pol, # bq, U,
     "Polite_dat_inf ": Polite_dat_inf, # bq, U,
     "Polite_dat_pol ": Polite_dat_pol, # bq, U,
+    "Polite_infm ": Polite_infm, # U20
+    "Polite_form ": Polite_form, # U20
+    "Polite_form_elev ": Polite_form_elev, # U20
+    "Polite_form_humb ": Polite_form_humb, # U20
     "Prefix_yes ": Prefix_yes, # U,
     "PrepCase_npr ": PrepCase_npr, # cz,
     "PrepCase_pre ": PrepCase_pre, # U,
@@ -384,6 +415,7 @@ IDS = {
     "ccomp": ccomp,
     "complm": complm,
     "conj": conj,
+    "cop": cop, # U20
     "csubj": csubj,
     "csubjpass": csubjpass,
     "dep": dep,
@@ -406,6 +438,8 @@ IDS = {
     "num": num,
     "number": number,
     "oprd": oprd,
+    "obj": obj, # U20
+    "obl": obl, # U20
     "parataxis": parataxis,
     "partmod": partmod,
     "pcomp": pcomp,
@@ -124,6 +124,8 @@ cdef class Parser:
         elif 'features' not in cfg:
             cfg['features'] = self.feature_templates
         self.model = ParserModel(cfg['features'])
+        self.model.l1_penalty = cfg.get('L1', 0.0)
+
         self.cfg = cfg

     def __reduce__(self):
@@ -258,15 +260,20 @@ cdef class Parser:
             self.model.set_featuresC(&eg.c, stcls.c)
             self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
             self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
-            self.model.updateC(&eg.c)
+            self.model.time += 1
             guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
+            if eg.c.costs[guess] > 0:
+                best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
+                for feat in eg.c.features[:eg.c.nr_feat]:
+                    self.model.update_weight_ftrl(feat.key, best, -feat.value * eg.c.costs[guess])
+                    self.model.update_weight_ftrl(feat.key, guess, feat.value * eg.c.costs[guess])
+
-            action = self.moves.c[eg.guess]
+            action = self.moves.c[guess]
             action.do(stcls.c, action.label)
-            loss += eg.costs[eg.guess]
+            loss += eg.costs[guess]
-            eg.fill_scores(0, eg.nr_class)
-            eg.fill_costs(0, eg.nr_class)
-            eg.fill_is_valid(1, eg.nr_class)
+            eg.fill_scores(0, eg.c.nr_class)
+            eg.fill_costs(0, eg.c.nr_class)
+            eg.fill_is_valid(1, eg.c.nr_class)
         return loss

     def step_through(self, Doc doc):
@@ -385,6 +392,14 @@ class ParserStateError(ValueError):
             "Please include the text that the parser failed on, which is:\n"
             "%s" % repr(doc.text))

+
+cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if costs[i] <= 0:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
+
+
 cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
                        int nr_class) except -1:
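The update above becomes explicitly cost-sensitive: when the model's best-scoring valid action (guess) has a positive cost, every feature is nudged toward the best zero-cost action via the FTRL weight update, penalising the wrong guess by the same amount. A pure-Python sketch of arg_max_if_gold, for illustration only:

    def arg_max_if_gold(scores, costs):
        # Among actions the oracle marks as correct (cost <= 0), pick the
        # one the model currently scores highest.
        best = -1
        for i, (score, cost) in enumerate(zip(scores, costs)):
            if cost <= 0 and (best == -1 or score > scores[best]):
                best = i
        return best

    assert arg_max_if_gold([0.2, 0.9, 0.5], [1.0, 1.0, 0.0]) == 2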
@@ -13,7 +13,7 @@ from thinc.linalg cimport VecVec
 from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
-from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
+from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
 from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .gold cimport GoldParse

@@ -152,6 +152,7 @@ cdef class Tagger:
         model = TaggerModel(cfg.get('features', self.feature_templates))
         self.vocab = vocab
         self.model = model
+        self.model.l1_penalty = 0.0
         # TODO: Move this to tag map
         self.freqs = {TAG: defaultdict(int)}
         for tag in self.tag_names:
@@ -4,9 +4,15 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.xfail
 @pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
 def test_issue792(en_tokenizer, text):
-    """Test for Issue #792: Trailing whitespace is removed after parsing."""
+    """Test for Issue #792: Trailing whitespace is removed after tokenization."""
     doc = en_tokenizer(text)
-    assert doc.text_with_ws == text
+    assert ''.join([token.text_with_ws for token in doc]) == text
+
+
+@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
+def test_control_issue792(en_tokenizer, text):
+    """Test base case for Issue #792: Non-trailing whitespace"""
+    doc = en_tokenizer(text)
+    assert ''.join([token.text_with_ws for token in doc]) == text
spacy/tests/regression/test_issue850.py (new file, 52 lines)
@@ -0,0 +1,52 @@
+'''
+Test Matcher matches with '*' operator and Boolean flag
+'''
+from __future__ import unicode_literals
+from __future__ import print_function
+import pytest
+
+from ...matcher import Matcher
+from ...vocab import Vocab
+from ...attrs import LOWER
+from ...tokens import Doc
+
+
+def test_basic_case():
+    matcher = Matcher(Vocab(
+        lex_attr_getters={LOWER: lambda string: string.lower()}))
+    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
+    matcher.add_pattern(
+        "FarAway",
+        [
+            {LOWER: "bob"},
+            {'OP': '*', LOWER: 'and'},
+            {LOWER: 'frank'}
+        ])
+    doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
+    match = matcher(doc)
+    assert len(match) == 1
+    ent_id, label, start, end = match[0]
+    assert start == 0
+    assert end == 4
+
+
+@pytest.mark.xfail
+def test_issue850():
+    '''The problem here is that the variable-length pattern matches the
+    succeeding token. We then don't handle the ambiguity correctly.'''
+    matcher = Matcher(Vocab(
+        lex_attr_getters={LOWER: lambda string: string.lower()}))
+    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
+    matcher.add_pattern(
+        "FarAway",
+        [
+            {LOWER: "bob"},
+            {'OP': '*', IS_ANY_TOKEN: True},
+            {LOWER: 'frank'}
+        ])
+    doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
+    match = matcher(doc)
+    assert len(match) == 1
+    ent_id, label, start, end = match[0]
+    assert start == 0
+    assert end == 4
spacy/tests/regression/test_issue859.py (new file, 12 lines)
@@ -0,0 +1,12 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
+                                  "aaabbb@ccc.com \nThank you!"])
+def test_issue859(en_tokenizer, text):
+    """Test that no extra space is added in doc.text method."""
+    doc = en_tokenizer(text)
+    assert doc.text == text
spacy/tests/test_pickles.py (new file, 40 lines)
@@ -0,0 +1,40 @@
+from __future__ import unicode_literals
+
+import io
+import pytest
+import dill as pickle
+
+from ..strings import StringStore
+from ..vocab import Vocab
+from ..attrs import NORM
+
+
+def test_pickle_string_store():
+    sstore = StringStore()
+    hello = sstore['hello']
+    bye = sstore['bye']
+    bdata = pickle.dumps(sstore, protocol=-1)
+    unpickled = pickle.loads(bdata)
+    assert unpickled['hello'] == hello
+    assert unpickled['bye'] == bye
+    assert len(sstore) == len(unpickled)
+
+
+@pytest.mark.xfail
+def test_pickle_vocab():
+    vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
+    dog = vocab[u'dog']
+    cat = vocab[u'cat']
+    assert dog.norm_ == 'do'
+    assert cat.norm_ == 'ca'
+
+    bdata = pickle.dumps(vocab)
+    unpickled = pickle.loads(bdata)
+
+    assert unpickled[u'dog'].orth == dog.orth
+    assert unpickled[u'cat'].orth == cat.orth
+    assert unpickled[u'dog'].norm == dog.norm
+    assert unpickled[u'cat'].norm == cat.norm
+    dog_ = unpickled[u'dog']
+    cat_ = unpickled[u'cat']
+    assert dog_.norm != cat_.norm
@@ -163,7 +163,6 @@ cdef class Tokenizer:
                 start = i
                 in_ws = not in_ws
             i += 1
-        i += 1
         if start < i:
             span = string[start:]
             key = hash_string(span)
@@ -275,7 +274,10 @@ cdef class Tokenizer:
         if cache_hit:
             pass
         elif self.token_match and self.token_match(string):
-            tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size())
+            # We're always saying 'no' to spaces here -- the caller will
+            # fix up the outermost one, with reference to the original.
+            # See Issue #859
+            tokens.push_back(self.vocab.get(tokens.mem, string), False)
         else:
             matches = self.find_infix(string)
             if not matches:
@@ -16,7 +16,7 @@ from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
-from ..parts_of_speech cimport CONJ, PUNCT, NOUN
+from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
 from .span cimport Span
@@ -20,7 +20,7 @@ from .. import parts_of_speech
 from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP
-from ..parts_of_speech cimport CONJ, PUNCT
+from ..parts_of_speech cimport CCONJ, PUNCT

 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport IS_BRACKET
spacy/vocab.pyx (116 lines changed)
@@ -9,11 +9,16 @@ import bz2
 import ujson as json
 import re

+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
 from .typedefs cimport attr_t
-from .cfile cimport CFile
+from .cfile cimport CFile, StringCFile
 from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs
 from .tokens.token cimport Token
@@ -346,17 +351,18 @@ cdef class Vocab:
                 Token.set_struct_attr(token, attr_id, value)
         return tokens

-    def dump(self, loc):
-        """Save the lexemes binary data to the given location.
+    def dump(self, loc=None):
+        """Save the lexemes binary data to the given location, or
+        return a byte-string with the data if loc is None.

         Arguments:
-            loc (Path): The path to save to.
+            loc (Path or None): The path to save to, or None.
         """
-        if hasattr(loc, 'as_posix'):
-            loc = loc.as_posix()
-        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
-        cdef CFile fp = CFile(bytes_loc, 'wb')
+        cdef CFile fp
+        if loc is None:
+            fp = StringCFile('wb')
+        else:
+            fp = CFile(loc, 'wb')
         cdef size_t st
         cdef size_t addr
         cdef hash_t key
@@ -378,6 +384,8 @@ cdef class Vocab:
             fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
            fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
         fp.close()
+        if loc is None:
+            return fp.string_data()

     def load_lexemes(self, loc):
         '''Load the binary vocabulary data from the given location.
@@ -427,6 +435,60 @@ cdef class Vocab:
             i += 1
         fp.close()

+    def _deserialize_lexemes(self, CFile fp):
+        '''Load the binary vocabulary data from the given CFile.
+        '''
+        cdef LexemeC* lexeme
+        cdef hash_t key
+        cdef unicode py_str
+        cdef attr_t orth
+        assert sizeof(orth) == sizeof(lexeme.orth)
+        i = 0
+        cdef int todo = fp.size
+        cdef int lex_size = sizeof(lexeme.flags)
+        lex_size += sizeof(lexeme.id)
+        lex_size += sizeof(lexeme.length)
+        lex_size += sizeof(lexeme.orth)
+        lex_size += sizeof(lexeme.lower)
+        lex_size += sizeof(lexeme.norm)
+        lex_size += sizeof(lexeme.shape)
+        lex_size += sizeof(lexeme.prefix)
+        lex_size += sizeof(lexeme.suffix)
+        lex_size += sizeof(lexeme.cluster)
+        lex_size += sizeof(lexeme.prob)
+        lex_size += sizeof(lexeme.sentiment)
+        lex_size += sizeof(lexeme.l2_norm)
+        lex_size += sizeof(lexeme.lang)
+        while True:
+            if todo < lex_size:
+                break
+            todo -= lex_size
+            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
+            # Copy data from the file into the lexeme
+            fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
+            fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))
+            fp.read_into(&lexeme.length, 1, sizeof(lexeme.length))
+            fp.read_into(&lexeme.orth, 1, sizeof(lexeme.orth))
+            fp.read_into(&lexeme.lower, 1, sizeof(lexeme.lower))
+            fp.read_into(&lexeme.norm, 1, sizeof(lexeme.norm))
+            fp.read_into(&lexeme.shape, 1, sizeof(lexeme.shape))
+            fp.read_into(&lexeme.prefix, 1, sizeof(lexeme.prefix))
+            fp.read_into(&lexeme.suffix, 1, sizeof(lexeme.suffix))
+            fp.read_into(&lexeme.cluster, 1, sizeof(lexeme.cluster))
+            fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
+            fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
+            fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
+            fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))
+
+            lexeme.vector = EMPTY_VEC
+            py_str = self.strings[lexeme.orth]
+            key = hash_string(py_str)
+            self._by_hash.set(key, lexeme)
+            self._by_orth.set(lexeme.orth, lexeme)
+            self.length += 1
+            i += 1
+        fp.close()
+
     def dump_vectors(self, out_loc):
         '''Save the word vectors to a binary file.

@@ -553,6 +615,42 @@ cdef class Vocab:
         return vec_len


+def pickle_vocab(vocab):
+    sstore = vocab.strings
+    morph = vocab.morphology
+    length = vocab.length
+    serializer = vocab._serializer
+    data_dir = vocab.data_dir
+    lex_attr_getters = vocab.lex_attr_getters
+
+    lexemes_data = vocab.dump()
+    vectors_length = vocab.vectors_length
+
+    return (unpickle_vocab,
+            (sstore, morph, serializer, data_dir, lex_attr_getters,
+             lexemes_data, length, vectors_length))
+
+
+def unpickle_vocab(sstore, morphology, serializer, data_dir,
+                   lex_attr_getters, bytes lexemes_data, int length, int vectors_length):
+    cdef Vocab vocab = Vocab()
+    vocab.length = length
+    vocab.vectors_length = vectors_length
+    vocab.strings = sstore
+    cdef CFile fp = StringCFile('r', data=lexemes_data)
+    vocab.morphology = morphology
+    vocab._serializer = serializer
+    vocab.data_dir = data_dir
+    vocab.lex_attr_getters = lex_attr_getters
+    vocab._deserialize_lexemes(fp)
+    vocab.length = length
+    vocab.vectors_length = vectors_length
+    return vocab
+
+
+copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab)
+
+
 def write_binary_vectors(in_loc, out_loc):
     cdef CFile out_file = CFile(out_loc, 'wb')
     cdef Address mem
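pickle_vocab/unpickle_vocab use the copy_reg registry (copyreg on Python 3) so that pickling a Vocab captures its lexeme table as the byte-string that dump() now returns when called with no path. The registration pattern in miniature (plain-Python illustration, not spaCy code):

    import copyreg  # named copy_reg on Python 2
    import pickle

    class Point(object):
        def __init__(self, x=0, y=0):
            self.x, self.y = x, y

    def pickle_point(point):
        # Return (reconstructor, args); pickle later calls unpickle_point(x, y).
        return (unpickle_point, (point.x, point.y))

    def unpickle_point(x, y):
        return Point(x, y)

    copyreg.pickle(Point, pickle_point)
    p = pickle.loads(pickle.dumps(Point(3, 4)))
    assert (p.x, p.y) == (3, 4)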