From b9ade7d4e090a2bd20626aa96ffa310031871c23 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 7 Mar 2019 14:03:07 +0100 Subject: [PATCH] Add MorphAnalysisC struct --- spacy/morphology.pxd | 51 +---- spacy/morphology.pyx | 26 +-- spacy/structs.pxd | 46 ++++ spacy/tokens/morphanalysis.pyx | 371 +++++++++++++++++++++++++++++++-- 4 files changed, 420 insertions(+), 74 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index adc5e5574..24e54bdee 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -3,7 +3,7 @@ from preshed.maps cimport PreshMap, PreshMapArray from libc.stdint cimport uint64_t from murmurhash cimport mrmr -from .structs cimport TokenC +from .structs cimport TokenC, MorphAnalysisC from .strings cimport StringStore from .typedefs cimport hash_t, attr_t, flags_t from .parts_of_speech cimport univ_pos_t @@ -24,7 +24,7 @@ cdef class Morphology: cdef readonly int n_tags cpdef update(self, hash_t morph, features) - cdef hash_t insert(self, RichTagC tag) except 0 + cdef hash_t insert(self, MorphAnalysisC tag) except 0 cdef int assign_untagged(self, TokenC* token) except -1 cdef int assign_tag(self, TokenC* token, tag) except -1 @@ -416,50 +416,3 @@ cdef enum univ_morph_t: Voice_int # hb end_Voice - -cdef struct RichTagC: - univ_pos_t pos - - univ_morph_t abbr - univ_morph_t adp_type - univ_morph_t adv_type - univ_morph_t animacy - univ_morph_t aspect - univ_morph_t case - univ_morph_t conj_type - univ_morph_t connegative - univ_morph_t definite - univ_morph_t degree - univ_morph_t derivation - univ_morph_t echo - univ_morph_t foreign - univ_morph_t gender - univ_morph_t hyph - univ_morph_t inf_form - univ_morph_t mood - univ_morph_t negative - univ_morph_t number - univ_morph_t name_type - univ_morph_t noun_type - univ_morph_t num_form - univ_morph_t num_type - univ_morph_t num_value - univ_morph_t part_form - univ_morph_t part_type - univ_morph_t person - univ_morph_t polite - univ_morph_t polarity - univ_morph_t poss - univ_morph_t prefix - univ_morph_t prep_case - univ_morph_t pron_type - univ_morph_t punct_side - univ_morph_t punct_type - univ_morph_t reflex - univ_morph_t style - univ_morph_t style_variant - univ_morph_t tense - univ_morph_t typo - univ_morph_t verb_form - univ_morph_t voice - univ_morph_t verb_type diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 40c7f66af..52acfedfb 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -111,13 +111,13 @@ cdef class Morphology: print(list(NAMES.keys())[:10]) print(NAMES.get(feature-1), NAMES.get(feature+1)) raise KeyError("Unknown feature: %d" % feature) - cdef RichTagC tag + cdef MorphAnalysisC tag tag = create_rich_tag(features) cdef hash_t key = self.insert(tag) return key def get(self, hash_t morph): - tag = self.tags.get(morph) + tag = self.tags.get(morph) if tag == NULL: return [] else: @@ -125,7 +125,7 @@ cdef class Morphology: cpdef update(self, hash_t morph, features): """Update a morphological analysis with new feature values.""" - tag = (self.tags.get(morph))[0] + tag = (self.tags.get(morph))[0] features = intify_features(features) cdef univ_morph_t feature for feature in features: @@ -168,10 +168,10 @@ cdef class Morphology: attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) self.exc[(tag_str, self.strings.add(orth_str))] = attrs - cdef hash_t insert(self, RichTagC tag) except 0: + cdef hash_t insert(self, MorphAnalysisC tag) except 0: cdef hash_t key = hash_tag(tag) if self.tags.get(key) == NULL: - tag_ptr = self.mem.alloc(1, sizeof(RichTagC)) + tag_ptr = self.mem.alloc(1, sizeof(MorphAnalysisC)) tag_ptr[0] = tag self.tags.set(key, tag_ptr) return key @@ -240,7 +240,7 @@ cdef class Morphology: def to_bytes(self): json_tags = [] for key in self.tags: - tag_ptr = self.tags.get(key) + tag_ptr = self.tags.get(key) if tag_ptr != NULL: json_tags.append(tag_to_json(tag_ptr[0])) return srsly.json_dumps(json_tags) @@ -261,18 +261,18 @@ cpdef univ_pos_t get_int_tag(pos_): cpdef intify_features(features): return {IDS.get(feature, feature) for feature in features} -cdef hash_t hash_tag(RichTagC tag) nogil: +cdef hash_t hash_tag(MorphAnalysisC tag) nogil: return mrmr.hash64(&tag, sizeof(tag), 0) -cdef RichTagC create_rich_tag(features) except *: - cdef RichTagC tag +cdef MorphAnalysisC create_rich_tag(features) except *: + cdef MorphAnalysisC tag cdef univ_morph_t feature memset(&tag, 0, sizeof(tag)) for feature in features: set_feature(&tag, feature, 1) return tag -cdef tag_to_json(RichTagC tag): +cdef tag_to_json(MorphAnalysisC tag): features = [] if tag.abbr != 0: features.append(NAMES[tag.abbr]) @@ -360,11 +360,11 @@ cdef tag_to_json(RichTagC tag): features.append(NAMES[tag.verb_type]) return features -cdef RichTagC tag_from_json(json_tag): - cdef RichTagC tag +cdef MorphAnalysisC tag_from_json(json_tag): + cdef MorphAnalysisC tag return tag -cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) except -1: +cdef int set_feature(MorphAnalysisC* tag, univ_morph_t feature, int value) except -1: if value == True: value_ = feature else: diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 9f7904919..7452123c0 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -74,4 +74,50 @@ cdef struct TokenC: hash_t ent_id +cdef struct MorphAnalysisC: + univ_pos_t pos + + attr_t abbr + attr_t adp_type + attr_t adv_type + attr_t animacy + attr_t aspect + attr_t case + attr_t conj_type + attr_t connegative + attr_t definite + attr_t degree + attr_t derivation + attr_t echo + attr_t foreign + attr_t gender + attr_t hyph + attr_t inf_form + attr_t mood + attr_t negative + attr_t number + attr_t name_type + attr_t noun_type + attr_t num_form + attr_t num_type + attr_t num_value + attr_t part_form + attr_t part_type + attr_t person + attr_t polite + attr_t polarity + attr_t poss + attr_t prefix + attr_t prep_case + attr_t pron_type + attr_t punct_side + attr_t punct_type + attr_t reflex + attr_t style + attr_t style_variant + attr_t tense + attr_t typo + attr_t verb_form + attr_t voice + attr_t verb_type diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index 09ab04d89..722f97994 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -1,10 +1,14 @@ from ..vocab cimport Vocab from ..typedefs cimport hash_t + cdef class Morphanalysis: """Control access to morphological features for a token.""" - def __init__(self, Vocab vocab, features=None): - pass + def __init__(self, Vocab vocab, features=tuple()): + self.vocab = vocab + self.key = self.vocab.morphology.add(features) + analysis = self.vocab.morphology.tags.get(self.key) + self.c = analysis[0] @classmethod def from_id(self, Vocab vocab, hash_t key): @@ -28,6 +32,12 @@ cdef class Morphanalysis: def __hash__(self): pass + def get(self, name): + pass + + def to_json(self): + pass + @property def is_base_form(self): pass @@ -44,17 +54,354 @@ cdef class Morphanalysis: def id(self): pass - def get(self, name): - pass + property abbr: + def __get__(self): + pass - def set(self, name, value): - pass + property adp_type: + def __get__(self): + pass - def add(self, feature): - pass + property adv_type: + def __get__(self): + pass - def remove(self, feature): - pass + property animacy: + def __get__(self): + pass - def to_json(self): - pass + property aspect: + def __get__(self): + pass + + property case: + def __get__(self): + pass + + property conj_type: + def __get__(self): + pass + + property connegative: + def __get__(self): + pass + + property definite: + def __get__(self): + pass + + property degree: + def __get__(self): + pass + + property derivation: + def __get__(self): + pass + + property echo: + def __get__(self): + pass + + property foreign: + def __get__(self): + pass + + property gender: + def __get__(self): + pass + + property hyph: + def __get__(self): + pass + + property inf_form: + def __get__(self): + pass + + property name_type: + def __get__(self): + pass + + property negative: + def __get__(self): + pass + + property mood: + def __get__(self): + pass + + property name_type: + def __get__(self): + pass + + property negative: + def __get__(self): + pass + + property number: + def __get__(self): + pass + + property num_form: + def __get__(self): + pass + + property num_type: + def __get__(self): + pass + + property num_value: + def __get__(self): + pass + + property part_form: + def __get__(self): + pass + + property part_type: + def __get__(self): + pass + + property person: + def __get__(self): + pass + + property polite: + def __get__(self): + pass + + property polarity: + def __get__(self): + pass + + property poss: + def __get__(self): + pass + + property prefix: + def __get__(self): + pass + + property prep_case: + def __get__(self): + pass + + property pron_type: + def __get__(self): + pass + + property punct_side: + def __get__(self): + pass + + property punct_type: + def __get__(self): + pass + + property reflex: + def __get__(self): + pass + + property style: + def __get__(self): + pass + + property style_variant: + def __get__(self): + pass + + property tense: + def __get__(self): + pass + + property typo: + def __get__(self): + pass + + property verb_form: + def __get__(self): + pass + + property voice: + def __get__(self): + pass + + property verb_type: + def __get__(self): + pass + + property abbr_: + def __get__(self): + pass + + property adp_type_: + def __get__(self): + pass + + property adv_type_: + def __get__(self): + pass + + property animacy_: + def __get__(self): + pass + + property aspect_: + def __get__(self): + pass + + property case_: + def __get__(self): + pass + + property conj_type_: + def __get__(self): + pass + + property connegative_: + def __get__(self): + pass + + property definite_: + def __get__(self): + pass + + property degree_: + def __get__(self): + pass + + property derivation_: + def __get__(self): + pass + + property echo_: + def __get__(self): + pass + + property foreign_: + def __get__(self): + pass + + property gender_: + def __get__(self): + pass + + property hyph_: + def __get__(self): + pass + + property inf_form_: + def __get__(self): + pass + + property name_type_: + def __get__(self): + pass + + property negative_: + def __get__(self): + pass + + property mood_: + def __get__(self): + pass + + property name_type_: + def __get__(self): + pass + + property negative_: + def __get__(self): + pass + + property number_: + def __get__(self): + pass + + property num_form_: + def __get__(self): + pass + + property num_type_: + def __get__(self): + pass + + property num_value_: + def __get__(self): + pass + + property part_form_: + def __get__(self): + pass + + property part_type_: + def __get__(self): + pass + + property person_: + def __get__(self): + pass + + property polite_: + def __get__(self): + pass + + property polarity_: + def __get__(self): + pass + + property poss_: + def __get__(self): + pass + + property prefix_: + def __get__(self): + pass + + property prep_case_: + def __get__(self): + pass + + property pron_type_: + def __get__(self): + pass + + property punct_side_: + def __get__(self): + pass + + property punct_type_: + def __get__(self): + pass + + property reflex_: + def __get__(self): + pass + + property style_: + def __get__(self): + pass + + property style_variant_: + def __get__(self): + pass + + property tense_: + def __get__(self): + pass + + property typo_: + def __get__(self): + pass + + property verb_form_: + def __get__(self): + pass + + property voice_: + def __get__(self): + pass + + property verb_type_: + def __get__(self): + pass