mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
* More work on language-generic parsing
This commit is contained in:
parent
86c4a8e3e2
commit
c2307fa9ee
11
spacy/fi/__init__.py
Normal file
11
spacy/fi/__init__.py
Normal file
|
@ -0,0 +1,11 @@
|
|||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from os import path
|
||||
|
||||
from ..language import Language
|
||||
|
||||
|
||||
class Finnish(Language):
    """Language subclass for Finnish.

    The only behaviour customised here is where the default data
    directory is looked up.
    """

    @classmethod
    def default_data_dir(cls):
        """Return the path of the ``data`` directory next to this module."""
        package_dir = path.dirname(__file__)
        return path.join(package_dir, 'data')
|
|
@ -148,13 +148,10 @@ class Language(object):
|
|||
vectors = cls.default_vectors(data_dir)
|
||||
if get_lex_attr is None:
|
||||
get_lex_attr = cls.default_lex_attrs(data_dir)
|
||||
if morphology is None:
|
||||
morphology = cls.default_morphology(path.join(data_dir, 'vocab'))
|
||||
return Vocab.from_dir(
|
||||
path.join(data_dir, 'vocab'),
|
||||
get_lex_attr=get_lex_attr,
|
||||
vectors=vectors,
|
||||
morphology=morphology)
|
||||
vectors=vectors)
|
||||
|
||||
@classmethod
|
||||
def default_tokenizer(cls, vocab, data_dir):
|
||||
|
|
|
@ -1,18 +1,41 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMapArray
|
||||
from libc.stdint cimport uint64_t
|
||||
|
||||
from .structs cimport TokenC
|
||||
from .strings cimport StringStore
|
||||
from .typedefs cimport attr_t
|
||||
from .parts_of_speech cimport univ_pos_t
|
||||
|
||||
|
||||
cdef struct RichTagC:
|
||||
uint64_t morph
|
||||
int id
|
||||
univ_pos_t pos
|
||||
attr_t name
|
||||
|
||||
|
||||
cdef struct MorphAnalysisC:
|
||||
RichTagC tag
|
||||
attr_t lemma
|
||||
|
||||
|
||||
cdef class Morphology:
|
||||
cdef readonly Pool mem
|
||||
cdef readonly object strings
|
||||
cdef public object lemmatizer
|
||||
cdef public object tag_map
|
||||
cdef public object n_tags
|
||||
cdef public object reverse_index
|
||||
cdef public object tag_names
|
||||
cdef public object tag_ids
|
||||
cdef public int n_tags
|
||||
|
||||
cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1
|
||||
cdef RichTagC* rich_tags
|
||||
cdef PreshMapArray _cache
|
||||
|
||||
cdef int assign_tag(self, TokenC* token, tag) except -1
|
||||
|
||||
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
|
||||
|
||||
|
||||
cdef int assign_from_dict(self, TokenC* token, props) except -1
|
||||
|
||||
#
|
||||
#cpdef enum Feature_t:
|
||||
|
|
|
@ -6,15 +6,10 @@ try:
|
|||
except ImportError:
|
||||
import json
|
||||
|
||||
from spacy.parts_of_speech import UNIV_POS_NAMES
|
||||
from .parts_of_speech import UNIV_POS_NAMES
|
||||
from .parts_of_speech cimport ADJ, VERB, NOUN
|
||||
|
||||
|
||||
cdef struct MorphAnalysisC:
|
||||
uint64_t[4] features
|
||||
attr_t lemma
|
||||
attr_t pos
|
||||
|
||||
|
||||
cdef class Morphology:
|
||||
@classmethod
|
||||
def from_dir(cls, data_dir, lemmatizer=None):
|
||||
|
@ -23,32 +18,37 @@ cdef class Morphology:
|
|||
lemmatizer = Lemmatizer.from_dir(data_dir)
|
||||
return cls(tag_map, {}, lemmatizer)
|
||||
|
||||
def __init__(self, tag_map, fused_tokens, lemmatizer):
|
||||
def __init__(self, string_store, tag_map, lemmatizer):
|
||||
self.mem = Pool()
|
||||
self.strings = string_store
|
||||
self.lemmatizer = lemmatizer
|
||||
self.tag_map = tag_map
|
||||
self.n_tags = len(tag_map)
|
||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||
self.tag_ids = {}
|
||||
for i, tag_str in enumerate(self.tag_names):
|
||||
self.tag_ids[tag_str] = i
|
||||
self._cache = PreshMapArray()
|
||||
self.reverse_index = {}
|
||||
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
|
||||
self.rich_tags[i].id = i
|
||||
self.rich_tags[i].name = self.strings[tag_str]
|
||||
self.rich_tags[i].morph = 0
|
||||
self.reverse_index[self.rich_tags[i].name] = i
|
||||
self._cache = PreshMapArray(self.n_tags)
|
||||
|
||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||
analysis = <MorphAnalysisC*>self._cache.get(tag, token.lex.orth)
|
||||
cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag
|
||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||
if analysis is NULL:
|
||||
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||
cached = self.decode_tag(tag)
|
||||
cached.lemma = self.lemmatize(token.pos, token.lex)
|
||||
analysis.tag = self.rich_tags[tag_id]
|
||||
analysis.lemma = self.lemmatize(tag, token.lex.orth)
|
||||
token.lemma = analysis.lemma
|
||||
token.pos = analysis.pos
|
||||
token.tag = analysis.tag
|
||||
token.morph = analysis.features
|
||||
token.pos = analysis.tag.pos
|
||||
token.tag = analysis.tag.name
|
||||
token.morph = analysis.tag.morph
|
||||
|
||||
cdef int assign_feature(self, TokenC* token, feature, value) except -1:
|
||||
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
|
||||
pass
|
||||
|
||||
def load_morph_exceptions(self, dict exc):
|
||||
# Map (form, pos) to (lemma, inflection)
|
||||
# Map (form, pos) to (lemma, rich tag)
|
||||
cdef unicode pos_str
|
||||
cdef unicode form_str
|
||||
cdef unicode lemma_str
|
||||
|
@ -57,121 +57,30 @@ cdef class Morphology:
|
|||
cdef int lemma
|
||||
cdef attr_t orth
|
||||
cdef int pos
|
||||
for pos_str, entries in exc.items():
|
||||
pos = self.tag_names.index(pos_str)
|
||||
for tag_str, entries in exc.items():
|
||||
tag = self.strings[tag_str]
|
||||
rich_tag = self.rich_tags[self.reverse_index[tag]]
|
||||
for form_str, props in entries.items():
|
||||
lemma_str = props.get('L', form_str)
|
||||
orth = self.strings[form_str]
|
||||
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||
cached.lemma = self.strings[lemma_str]
|
||||
self.set_features(cached, props)
|
||||
self._cache.set(pos, orth, <void*>cached)
|
||||
orth = self.strings[form_str]
|
||||
for name_str, value_str in props.items():
|
||||
if name_str == 'L':
|
||||
cached.lemma = self.strings[value_str]
|
||||
else:
|
||||
self.assign_feature(&cached.tag.morph, name_str, value_str)
|
||||
if cached.lemma == 0:
|
||||
cached.lemma = self.lemmatize(rich_tag.pos, orth)
|
||||
self._cache.set(rich_tag.pos, orth, <void*>cached)
|
||||
|
||||
def _load_special_tokenization(self, special_cases):
|
||||
'''Add a special-case tokenization rule.
|
||||
'''
|
||||
cdef int i
|
||||
cdef list substrings
|
||||
cdef unicode chunk
|
||||
cdef unicode form
|
||||
cdef unicode lemma
|
||||
cdef dict props
|
||||
cdef LexemeC** lexemes
|
||||
cdef hash_t hashed
|
||||
for chunk, substrings in sorted(special_cases.items()):
|
||||
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
||||
for i, props in enumerate(substrings):
|
||||
# Set the special tokens up to have morphology and lemmas if
|
||||
# specified, otherwise use the part-of-speech tag (if specified)
|
||||
form = props['F']
|
||||
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
|
||||
morphology = self.vocab.morphology.decode_dict(props)
|
||||
tokens[i].lemma = morph_analysis.lemma
|
||||
tokens[i].pos = morph_analysis.pos
|
||||
tokens[i].tag = morph_analysis.tag
|
||||
tokens[i].morph = morph_analysis.morph
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
cached.length = len(substrings)
|
||||
cached.is_lex = False
|
||||
cached.data.tokens = tokens
|
||||
hashed = hash_string(chunk)
|
||||
self._specials.set(hashed, cached)
|
||||
self._cache.set(hashed, cached)
|
||||
|
||||
|
||||
|
||||
|
||||
#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
|
||||
# morph.number = props.get('number', 0)
|
||||
# morph.tenspect = props.get('tenspect', 0)
|
||||
# morph.mood = props.get('mood', 0)
|
||||
# morph.gender = props.get('gender', 0)
|
||||
# morph.person = props.get('person', 0)
|
||||
# morph.case = props.get('case', 0)
|
||||
# morph.misc = props.get('misc', 0)
|
||||
#
|
||||
#
|
||||
#cdef class Morphology:
|
||||
# cdef Pool mem
|
||||
# cdef PreshMap table
|
||||
#
|
||||
# def __init__(self, tags, exceptions):
|
||||
# pass
|
||||
#
|
||||
# def __getitem__(self, hash_t id_):
|
||||
# pass
|
||||
#
|
||||
# cdef const InflectionC* get(self, hash_t key) except NULL:
|
||||
# pass
|
||||
#
|
||||
# cdef MorphAnalysis analyse(const TokenC* token) except -1:
|
||||
# cdef struct MorphAnalysis morphology
|
||||
# tokens[i].pos = tag.pos
|
||||
# cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
|
||||
# if cached is NULL:
|
||||
# cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
|
||||
# cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
|
||||
# cached.morph = tag.morph
|
||||
# self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
|
||||
# tokens[i].lemma = cached.lemma
|
||||
# tokens[i].morph = cached.morph
|
||||
#
|
||||
# cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
|
||||
# if self.lemmatizer is None:
|
||||
# return lex.orth
|
||||
# cdef unicode py_string = self.strings[lex.orth]
|
||||
# if pos != NOUN and pos != VERB and pos != ADJ:
|
||||
# return lex.orth
|
||||
# cdef set lemma_strings
|
||||
# cdef unicode lemma_string
|
||||
# lemma_strings = self.lemmatizer(py_string, pos)
|
||||
# lemma_string = sorted(lemma_strings)[0]
|
||||
# lemma = self.strings[lemma_string]
|
||||
# return lemma
|
||||
#
|
||||
#
|
||||
#cdef class Inflection:
|
||||
# cdef InflectionC* c
|
||||
#
|
||||
# def __init__(self, container, id_):
|
||||
# self.c = container[id_]
|
||||
# self.container = container
|
||||
#
|
||||
# for i, feat_id in enumerate(feat_ids):
|
||||
# feature, value = parse_id(feat_id)
|
||||
# self.add_value(feature, value, True)
|
||||
#
|
||||
# def has(self, Value_t feat_value_id):
|
||||
# part = feat_value_id % 64
|
||||
# bit = feat_value_id / 64
|
||||
# if self.value_set[part] & bit:
|
||||
# return True
|
||||
# else:
|
||||
# return False
|
||||
#
|
||||
# property pos: def __get__(self): return self.c.pos
|
||||
#
|
||||
# property id: def __get__(self): return self.c.id
|
||||
#
|
||||
# property features:
|
||||
# pass
|
||||
def lemmatize(self, const univ_pos_t pos, attr_t orth):
    """Return the string-store ID of the lemma for ``orth`` under ``pos``.

    pos: universal part-of-speech of the word.
    orth: string-store ID of the word's surface form.

    ``orth`` is returned unchanged when no lemmatizer is configured, when
    ``pos`` is not NOUN, VERB or ADJ, or when the lemmatizer proposes no
    candidates. Otherwise the lexicographically smallest candidate is
    chosen, so the result is deterministic.
    """
    cdef unicode py_string
    cdef set lemma_strings
    cdef unicode lemma_string
    if self.lemmatizer is None:
        return orth
    # Check the part-of-speech first: only nouns, verbs and adjectives are
    # lemmatized, so the string-store lookup is wasted for everything else.
    if pos != NOUN and pos != VERB and pos != ADJ:
        return orth
    py_string = self.strings[orth]
    lemma_strings = self.lemmatizer(py_string, pos)
    if not lemma_strings:
        # No candidates: fall back to the surface form instead of raising.
        return orth
    # min() picks the same element as sorted(...)[0] without a full sort.
    lemma_string = min(lemma_strings)
    return self.strings[lemma_string]
|
||||
|
|
|
@ -25,17 +25,6 @@ cdef struct LexemeC:
|
|||
float sentiment
|
||||
float l2_norm
|
||||
|
||||
cdef struct MorphFeatC:
|
||||
int name
|
||||
int value
|
||||
|
||||
|
||||
cdef struct MorphologyC:
|
||||
uint64_t[4] feature_set
|
||||
MorphFeatC* features
|
||||
univ_pos_t pos
|
||||
int n
|
||||
|
||||
|
||||
cdef struct Entity:
|
||||
int start
|
||||
|
@ -54,8 +43,8 @@ cdef struct Constituent:
|
|||
|
||||
cdef struct TokenC:
|
||||
const LexemeC* lex
|
||||
const MorphologyC* morph
|
||||
const Constituent* ctnt
|
||||
uint64_t morph
|
||||
univ_pos_t pos
|
||||
bint spacy
|
||||
int tag
|
||||
|
|
|
@ -104,7 +104,7 @@ cdef class Tagger:
|
|||
|
||||
@classmethod
|
||||
def blank(cls, vocab, templates):
|
||||
model = Model(vocab.morphology.n_tags, templates, model_loc=None)
|
||||
model = Model(vocab.n_tags, templates, model_loc=None)
|
||||
return cls(vocab, model)
|
||||
|
||||
@classmethod
|
||||
|
@ -113,7 +113,7 @@ cdef class Tagger:
|
|||
templates = json.loads(open(path.join(data_dir, 'templates.json')))
|
||||
else:
|
||||
templates = cls.default_templates()
|
||||
model = Model(vocab.morphology.n_tags, templates, data_dir)
|
||||
model = Model(vocab.n_tags, templates, data_dir)
|
||||
return cls(vocab, model)
|
||||
|
||||
def __init__(self, Vocab vocab, model):
|
||||
|
@ -128,7 +128,7 @@ cdef class Tagger:
|
|||
|
||||
@property
|
||||
def tag_names(self):
|
||||
return self.vocab.morphology.tag_names
|
||||
return self.vocab.tag_names
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||
|
@ -143,14 +143,15 @@ cdef class Tagger:
|
|||
for i in range(tokens.length):
|
||||
if tokens.data[i].pos == 0:
|
||||
guess = self.predict(i, tokens.data)
|
||||
self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
|
||||
self.vocab.morphology.assign_tag(&tokens.data[i], guess)
|
||||
|
||||
tokens.is_tagged = True
|
||||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def tag_from_strings(self, Doc tokens, object tag_strs):
|
||||
cdef int i
|
||||
for i in range(tokens.length):
|
||||
self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i])
|
||||
self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
|
||||
tokens.is_tagged = True
|
||||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
|
@ -168,7 +169,9 @@ cdef class Tagger:
|
|||
for i in range(tokens.length):
|
||||
guess = self.update(i, tokens.data, golds[i])
|
||||
loss = golds[i] != -1 and guess != golds[i]
|
||||
self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
|
||||
|
||||
self.vocab.morphology.assign_tag(&tokens.data[i], guess)
|
||||
|
||||
correct += loss == 0
|
||||
self.freqs[TAG][tokens.data[i].tag] += 1
|
||||
return correct
|
||||
|
|
|
@ -7,12 +7,7 @@ from .typedefs cimport hash_t
|
|||
from .structs cimport LexemeC, TokenC
|
||||
from .strings cimport StringStore
|
||||
from .tokens.doc cimport Doc
|
||||
from .vocab cimport Vocab, _Cached
|
||||
|
||||
|
||||
cdef union LexemesOrTokens:
|
||||
const LexemeC* const* lexemes
|
||||
TokenC* tokens
|
||||
from .vocab cimport Vocab, LexemesOrTokens, _Cached
|
||||
|
||||
|
||||
cdef class Tokenizer:
|
||||
|
|
|
@ -192,9 +192,7 @@ cdef class Tokenizer:
|
|||
tokens.push_back(prefixes[0][i], False)
|
||||
if string:
|
||||
cache_hit = self._try_cache(hash_string(string), tokens)
|
||||
if cache_hit:
|
||||
pass
|
||||
else:
|
||||
if not cache_hit:
|
||||
match = self.find_infix(string)
|
||||
if match is None:
|
||||
tokens.push_back(self.vocab.get(tokens.mem, string), False)
|
||||
|
@ -253,38 +251,10 @@ cdef class Tokenizer:
|
|||
cdef LexemeC** lexemes
|
||||
cdef hash_t hashed
|
||||
for chunk, substrings in sorted(special_cases.items()):
|
||||
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
||||
for i, props in enumerate(substrings):
|
||||
form = props['F']
|
||||
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
|
||||
lemma = props.get('L', form)
|
||||
tokens[i].lemma = self.vocab.strings[lemma]
|
||||
#TODO
|
||||
#self.vocab.morphology.assign_from_dict(&tokens[i], props)
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
cached.length = len(substrings)
|
||||
cached.is_lex = False
|
||||
cached.data.tokens = tokens
|
||||
hashed = hash_string(chunk)
|
||||
self._specials.set(hashed, cached)
|
||||
self._cache.set(hashed, cached)
|
||||
|
||||
|
||||
#if lemma is not None:
|
||||
# tokens[i].lemma = self.vocab.strings[lemma]
|
||||
#else:
|
||||
# tokens[i].lemma = 0
|
||||
#if 'pos' in props:
|
||||
# inflection = self.vocab.morphology.get(props['pos'])
|
||||
# inflection.assign(&tokens[i])
|
||||
# # These are defaults, which can be over-ridden by the
|
||||
# # token-specific props.
|
||||
# #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
|
||||
# #tokens[i].pos = pos
|
||||
# ## These are defaults, which can be over-ridden by the
|
||||
# ## token-specific props.
|
||||
# #set_morph_from_dict(&tokens[i].morph, morph_features)
|
||||
# #if tokens[i].lemma == 0:
|
||||
# # tokens[i].lemma = tokens[i].lex.orth
|
||||
##set_morph_from_dict(&tokens[i].morph, props)
|
||||
|
||||
cached.data.tokens = self.vocab.make_fused_token(substrings)
|
||||
key = hash_string(chunk)
|
||||
self._specials.set(key, cached)
|
||||
self._cache.set(key, cached)
|
||||
|
|
|
@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
|
|||
|
||||
|
||||
ctypedef const LexemeC* const_Lexeme_ptr
|
||||
ctypedef TokenC* TokenC_ptr
|
||||
ctypedef const TokenC* const_TokenC_ptr
|
||||
|
||||
ctypedef fused LexemeOrToken:
|
||||
const_Lexeme_ptr
|
||||
TokenC_ptr
|
||||
const_TokenC_ptr
|
||||
|
||||
|
||||
cdef class Doc:
|
||||
|
|
|
@ -209,7 +209,7 @@ cdef class Doc:
|
|||
if self.length == self.max_length:
|
||||
self._realloc(self.length * 2)
|
||||
cdef TokenC* t = &self.data[self.length]
|
||||
if LexemeOrToken is TokenC_ptr:
|
||||
if LexemeOrToken is const_TokenC_ptr:
|
||||
t[0] = lex_or_tok[0]
|
||||
else:
|
||||
t.lex = lex_or_tok
|
||||
|
|
|
@ -15,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME
|
|||
|
||||
cdef union LexemesOrTokens:
|
||||
const LexemeC* const* lexemes
|
||||
TokenC* tokens
|
||||
const TokenC* tokens
|
||||
|
||||
|
||||
cdef struct _Cached:
|
||||
|
@ -37,6 +37,7 @@ cdef class Vocab:
|
|||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
|
||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
|
||||
cdef const TokenC* make_fused_token(self, substrings) except NULL
|
||||
|
||||
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
||||
|
|
|
@ -17,6 +17,7 @@ from .strings cimport hash_string
|
|||
from .orth cimport word_shape
|
||||
from .typedefs cimport attr_t
|
||||
from .cfile cimport CFile
|
||||
from .lemmatizer import Lemmatizer
|
||||
|
||||
from cymem.cymem cimport Address
|
||||
from . import util
|
||||
|
@ -36,20 +37,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
|
|||
cdef class Vocab:
|
||||
'''A map container for a language's LexemeC structs.
|
||||
'''
|
||||
@classmethod
|
||||
def default_morphology(cls):
|
||||
return Morphology({'VBZ': ['VERB', {}]}, [], None)
|
||||
|
||||
def __init__(self, get_lex_attr=None, morphology=None, vectors=None):
|
||||
self.get_lex_attr = get_lex_attr
|
||||
if morphology is None:
|
||||
morphology = self.default_morphology()
|
||||
self.morphology = morphology
|
||||
|
||||
def __init__(self, get_lex_attr=None, tag_map=None, vectors=None):
|
||||
self.mem = Pool()
|
||||
self._by_hash = PreshMap()
|
||||
self._by_orth = PreshMap()
|
||||
self.strings = StringStore()
|
||||
self.get_lex_attr = get_lex_attr
|
||||
self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {}))
|
||||
|
||||
self.length = 1
|
||||
self._serializer = None
|
||||
|
@ -60,10 +54,9 @@ cdef class Vocab:
|
|||
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
|
||||
if not path.isdir(data_dir):
|
||||
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
||||
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors,
|
||||
morphology=morphology)
|
||||
self.load_lexemes(path.join(data_dir, 'strings.txt'),
|
||||
path.join(data_dir, 'lexemes.bin'))
|
||||
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
|
||||
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
|
||||
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
|
||||
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
|
||||
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
||||
return self
|
||||
|
@ -172,6 +165,22 @@ cdef class Vocab:
|
|||
orth = id_or_string
|
||||
return Lexeme(self, orth)
|
||||
|
||||
cdef const TokenC* make_fused_token(self, substrings) except NULL:
    """Build the TokenC array for a special-case (fused) tokenization.

    substrings: sequence of dicts, one per output token. Each dict must
        have an 'F' key (the token's form) and may have 'pos', 'L' (lemma)
        and 'morph' (a dict of feature -> value) keys.

    Returns a pointer to ``len(substrings) + 1`` TokenC structs allocated
    from ``self.mem``; only the first ``len(substrings)`` are filled in.
    NOTE(review): the extra trailing slot is presumably a terminator --
    confirm against the callers.
    """
    cdef int idx
    cdef TokenC* fused = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
    cdef TokenC* tok
    for idx, spec in enumerate(substrings):
        tok = &fused[idx]
        # Set the special tokens up to have morphology and lemmas if
        # specified, otherwise use the part-of-speech tag (if specified)
        tok.lex = <LexemeC*>self.get(self.mem, spec['F'])
        if 'pos' in spec:
            self.morphology.assign_tag(tok, spec['pos'])
        # Applied after assign_tag so an explicit lemma overrides the one
        # that assign_tag sets.
        if 'L' in spec:
            tok.lemma = self.strings[spec['L']]
        morph_features = spec.get('morph', {})
        for feature, value in morph_features.items():
            self.morphology.assign_feature(&tok.morph, feature, value)
    return fused
|
||||
|
||||
def dump(self, loc):
|
||||
if path.exists(loc):
|
||||
assert not path.isdir(loc)
|
||||
|
|
Loading…
Reference in New Issue
Block a user