* Work on new morphology organization

This commit is contained in:
Matthew Honnibal 2015-08-27 23:11:51 +02:00
parent 5b89e2454c
commit 86c4a8e3e2

View File

@ -9,6 +9,12 @@ except ImportError:
from spacy.parts_of_speech import UNIV_POS_NAMES from spacy.parts_of_speech import UNIV_POS_NAMES
cdef struct MorphAnalysisC:
uint64_t[4] features
attr_t lemma
attr_t pos
cdef class Morphology: cdef class Morphology:
@classmethod @classmethod
def from_dir(cls, data_dir, lemmatizer=None): def from_dir(cls, data_dir, lemmatizer=None):
@ -25,39 +31,74 @@ cdef class Morphology:
self.tag_ids = {} self.tag_ids = {}
for i, tag_str in enumerate(self.tag_names): for i, tag_str in enumerate(self.tag_names):
self.tag_ids[tag_str] = i self.tag_ids[tag_str] = i
self._cache = PreshMapArray()
cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1: cdef int assign_tag(self, TokenC* token, tag) except -1:
# TODO Caching analysis = <MorphAnalysisC*>self._cache.get(tag, token.lex.orth)
props = self.tag_map[self.tag_names[tag]] if analysis is NULL:
token.pos = UNIV_POS_NAMES[props['pos'].upper()] analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
token.tag = strings[self.tag_names[tag]] cached = self.decode_tag(tag)
lemma = self.lemmatizer(strings[token.lex.orth], token.pos) cached.lemma = self.lemmatize(token.pos, token.lex)
token.lemma = strings[lemma] token.lemma = analysis.lemma
#token.inflection = # TODO token.pos = analysis.pos
token.tag = analysis.tag
token.morph = analysis.features
cdef int assign_from_dict(self, TokenC* token, props) except -1: cdef int assign_feature(self, TokenC* token, feature, value) except -1:
pass pass
def load_morph_exceptions(self, dict exc): def load_morph_exceptions(self, dict exc):
pass
# Map (form, pos) to (lemma, inflection) # Map (form, pos) to (lemma, inflection)
#cdef unicode pos_str cdef unicode pos_str
#cdef unicode form_str cdef unicode form_str
#cdef unicode lemma_str cdef unicode lemma_str
#cdef dict entries cdef dict entries
#cdef dict props cdef dict props
#cdef int lemma cdef int lemma
#cdef attr_t orth cdef attr_t orth
#cdef int pos cdef int pos
#for pos_str, entries in exc.items(): for pos_str, entries in exc.items():
# pos = self.tag_names.index(pos_str) pos = self.tag_names.index(pos_str)
# for form_str, props in entries.items(): for form_str, props in entries.items():
# lemma_str = props.get('L', form_str) lemma_str = props.get('L', form_str)
# orth = self.strings[form_str] orth = self.strings[form_str]
# cached = <InflectedLemma*>self.mem.alloc(1, sizeof(InflectedLemma)) cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
# cached.lemma = self.strings[lemma_str] cached.lemma = self.strings[lemma_str]
# set_morph_from_dict(&cached.morph, props) self.set_features(cached, props)
# self._morph_cache.set(pos, orth, <void*>cached) self._cache.set(pos, orth, <void*>cached)
def _load_special_tokenization(self, special_cases):
'''Add a special-case tokenization rule.
'''
cdef int i
cdef list substrings
cdef unicode chunk
cdef unicode form
cdef unicode lemma
cdef dict props
cdef LexemeC** lexemes
cdef hash_t hashed
for chunk, substrings in sorted(special_cases.items()):
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
# Set the special tokens up to have morphology and lemmas if
# specified, otherwise use the part-of-speech tag (if specified)
form = props['F']
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
morphology = self.vocab.morphology.decode_dict(props)
tokens[i].lemma = morph_analysis.lemma
tokens[i].pos = morph_analysis.pos
tokens[i].tag = morph_analysis.tag
tokens[i].morph = morph_analysis.morph
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings)
cached.is_lex = False
cached.data.tokens = tokens
hashed = hash_string(chunk)
self._specials.set(hashed, cached)
self._cache.set(hashed, cached)
#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: #cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: