mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 01:34:30 +03:00
WIP on supporting morphology features
This commit is contained in:
parent
ac5742223a
commit
6ae645c4ef
|
@ -1,48 +1,30 @@
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from preshed.maps cimport PreshMapArray
|
from preshed.maps cimport PreshMap
|
||||||
from libc.stdint cimport uint64_t
|
from libc.stdint cimport uint64_t
|
||||||
|
from murmurhash cimport mrmr
|
||||||
|
|
||||||
from .structs cimport TokenC
|
from .structs cimport TokenC
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
from .typedefs cimport attr_t, flags_t
|
from .typedefs cimport hash_t, attr_t, flags_t
|
||||||
from .parts_of_speech cimport univ_pos_t
|
from .parts_of_speech cimport univ_pos_t
|
||||||
|
|
||||||
from . cimport symbols
|
from . cimport symbols
|
||||||
|
|
||||||
|
|
||||||
cdef struct RichTagC:
|
|
||||||
uint64_t morph
|
|
||||||
int id
|
|
||||||
univ_pos_t pos
|
|
||||||
attr_t name
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct MorphAnalysisC:
|
|
||||||
RichTagC tag
|
|
||||||
attr_t lemma
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
cdef readonly Pool mem
|
cdef readonly Pool mem
|
||||||
cdef readonly StringStore strings
|
cdef readonly StringStore strings
|
||||||
|
cdef PreshMap tags # Keyed by hash, value is pointer to tag
|
||||||
|
|
||||||
cdef public object lemmatizer
|
cdef public object lemmatizer
|
||||||
cdef readonly object tag_map
|
cdef readonly object tag_map
|
||||||
cdef public object n_tags
|
|
||||||
cdef public object reverse_index
|
|
||||||
cdef public object tag_names
|
|
||||||
cdef public object exc
|
|
||||||
|
|
||||||
cdef RichTagC* rich_tags
|
|
||||||
cdef PreshMapArray _cache
|
|
||||||
|
|
||||||
|
cdef hash_t insert(self, RichTagC tag) except 0
|
||||||
|
|
||||||
cdef int assign_untagged(self, TokenC* token) except -1
|
cdef int assign_untagged(self, TokenC* token) except -1
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag) except -1
|
cdef int assign_tag(self, TokenC* token, tag) except -1
|
||||||
|
|
||||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
|
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
|
||||||
|
cdef update_token_morph(self, TokenC* token, features)
|
||||||
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
|
cdef set_token_morph(self, TokenC* token, pos, features)
|
||||||
|
|
||||||
|
|
||||||
cdef enum univ_morph_t:
|
cdef enum univ_morph_t:
|
||||||
NIL = 0
|
NIL = 0
|
||||||
|
@ -298,4 +280,47 @@ cdef enum univ_morph_t:
|
||||||
VerbType_mod # U
|
VerbType_mod # U
|
||||||
VerbType_light # U
|
VerbType_light # U
|
||||||
|
|
||||||
|
cdef struct RichTagC:
|
||||||
|
univ_pos_t pos
|
||||||
|
|
||||||
|
univ_morph_t abbr
|
||||||
|
univ_morph_t adp_type
|
||||||
|
univ_morph_t adv_type
|
||||||
|
univ_morph_t animacy
|
||||||
|
univ_morph_t aspect
|
||||||
|
univ_morph_t case
|
||||||
|
univ_morph_t conj_type
|
||||||
|
univ_morph_t connegative
|
||||||
|
univ_morph_t definite
|
||||||
|
univ_morph_t degree
|
||||||
|
univ_morph_t derivation
|
||||||
|
univ_morph_t echo
|
||||||
|
univ_morph_t foreign
|
||||||
|
univ_morph_t gender
|
||||||
|
univ_morph_t hyph
|
||||||
|
univ_morph_t inf_form
|
||||||
|
univ_morph_t mood
|
||||||
|
univ_morph_t negative
|
||||||
|
univ_morph_t number
|
||||||
|
univ_morph_t name_type
|
||||||
|
univ_morph_t num_form
|
||||||
|
univ_morph_t num_type
|
||||||
|
univ_morph_t num_value
|
||||||
|
univ_morph_t part_form
|
||||||
|
univ_morph_t part_type
|
||||||
|
univ_morph_t person
|
||||||
|
univ_morph_t polite
|
||||||
|
univ_morph_t polarity
|
||||||
|
univ_morph_t poss
|
||||||
|
univ_morph_t prefix
|
||||||
|
univ_morph_t prep_case
|
||||||
|
univ_morph_t pron_type
|
||||||
|
univ_morph_t punct_side
|
||||||
|
univ_morph_t punct_type
|
||||||
|
univ_morph_t reflex
|
||||||
|
univ_morph_t style
|
||||||
|
univ_morph_t style_variant
|
||||||
|
univ_morph_t tense
|
||||||
|
univ_morph_t verb_form
|
||||||
|
univ_morph_t voice
|
||||||
|
univ_morph_t verb_type
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
|
import ujson as json
|
||||||
|
|
||||||
from .attrs cimport POS, IS_SPACE
|
from .attrs cimport POS, IS_SPACE
|
||||||
from .attrs import LEMMA, intify_attrs
|
from .attrs import LEMMA, intify_attrs
|
||||||
|
@ -12,6 +13,7 @@ from .lexeme cimport Lexeme
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_props(props):
|
def _normalize_props(props):
|
||||||
"""Transform deprecated string keys to correct names."""
|
"""Transform deprecated string keys to correct names."""
|
||||||
out = {}
|
out = {}
|
||||||
|
@ -32,9 +34,17 @@ def _normalize_props(props):
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
|
'''Store the possible morphological analyses for a language, and index them
|
||||||
|
by hash.
|
||||||
|
|
||||||
|
To save space on each token, tokens only know the hash of their morphological
|
||||||
|
analysis, so queries of morphological attributes are delegated
|
||||||
|
to this class.
|
||||||
|
'''
|
||||||
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
|
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.strings = string_store
|
self.strings = string_store
|
||||||
|
self.tags = PreshMap()
|
||||||
# Add special space symbol. We prefix with underscore, to make sure it
|
# Add special space symbol. We prefix with underscore, to make sure it
|
||||||
# always sorts to the end.
|
# always sorts to the end.
|
||||||
space_attrs = tag_map.get('SP', {POS: SPACE})
|
space_attrs = tag_map.get('SP', {POS: SPACE})
|
||||||
|
@ -47,32 +57,46 @@ cdef class Morphology:
|
||||||
self.lemmatizer = lemmatizer
|
self.lemmatizer = lemmatizer
|
||||||
self.n_tags = len(tag_map)
|
self.n_tags = len(tag_map)
|
||||||
self.reverse_index = {}
|
self.reverse_index = {}
|
||||||
|
|
||||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
|
|
||||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||||
self.strings.add(tag_str)
|
|
||||||
self.tag_map[tag_str] = dict(attrs)
|
self.tag_map[tag_str] = dict(attrs)
|
||||||
attrs = _normalize_props(attrs)
|
self.reverse_index[i] = self.strings.add(tag_str)
|
||||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
|
||||||
self.rich_tags[i].id = i
|
|
||||||
self.rich_tags[i].name = self.strings.add(tag_str)
|
|
||||||
self.rich_tags[i].morph = 0
|
|
||||||
self.rich_tags[i].pos = attrs[POS]
|
|
||||||
self.reverse_index[self.rich_tags[i].name] = i
|
|
||||||
# Add a 'null' tag, which we can reference when assign morphology to
|
|
||||||
# untagged tokens.
|
|
||||||
self.rich_tags[self.n_tags].id = self.n_tags
|
|
||||||
|
|
||||||
self._cache = PreshMapArray(self.n_tags)
|
self._cache = PreshMapArray(self.n_tags)
|
||||||
self.exc = {}
|
self.exc = {}
|
||||||
if exc is not None:
|
if exc is not None:
|
||||||
for (tag_str, orth_str), attrs in exc.items():
|
for (tag_str, orth_str), attrs in exc.items():
|
||||||
self.add_special_case(tag_str, orth_str, attrs)
|
self.add_special_case(tag_str, orth_str, attrs)
|
||||||
|
|
||||||
|
def add(self, features):
|
||||||
|
"""Insert a morphological analysis in the morphology table, if not already
|
||||||
|
present. Returns the hash of the new analysis.
|
||||||
|
"""
|
||||||
|
features = intify_features(self.strings, features)
|
||||||
|
cdef RichTagC tag = create_rich_tag(features)
|
||||||
|
cdef hash_t key = self.insert(tag)
|
||||||
|
return key
|
||||||
|
|
||||||
def __reduce__(self):
|
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
||||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
if orth not in self.strings:
|
||||||
self.exc), None, None)
|
return orth
|
||||||
|
cdef unicode py_string = self.strings[orth]
|
||||||
|
if self.lemmatizer is None:
|
||||||
|
return self.strings.add(py_string.lower())
|
||||||
|
cdef list lemma_strings
|
||||||
|
cdef unicode lemma_string
|
||||||
|
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
||||||
|
lemma_string = lemma_strings[0]
|
||||||
|
lemma = self.strings.add(lemma_string)
|
||||||
|
return lemma
|
||||||
|
|
||||||
|
cdef hash_t insert(self, RichTagC tag) except 0:
|
||||||
|
cdef hash_t key = hash_tag(tag)
|
||||||
|
if self.tags.get(key) == NULL:
|
||||||
|
tag_ptr = <RichTagC*>self.mem.alloc(1, sizeof(RichTagC))
|
||||||
|
tag_ptr[0] = tag
|
||||||
|
self.tags.set(key, <void*>tag_ptr)
|
||||||
|
return key
|
||||||
|
|
||||||
cdef int assign_untagged(self, TokenC* token) except -1:
|
cdef int assign_untagged(self, TokenC* token) except -1:
|
||||||
"""Set morphological attributes on a token without a POS tag. Uses
|
"""Set morphological attributes on a token without a POS tag. Uses
|
||||||
the lemmatizer's lookup() method, which looks up the string in the
|
the lemmatizer's lookup() method, which looks up the string in the
|
||||||
|
@ -101,84 +125,284 @@ cdef class Morphology:
|
||||||
# figure out why the statistical model fails. Related to Issue #220
|
# figure out why the statistical model fails. Related to Issue #220
|
||||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||||
tag_id = self.reverse_index[self.strings.add('_SP')]
|
tag_id = self.reverse_index[self.strings.add('_SP')]
|
||||||
rich_tag = self.rich_tags[tag_id]
|
lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
|
||||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
if lemma == 0:
|
||||||
if analysis is NULL:
|
tag_str = self.tag_names[tag_id]
|
||||||
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
features = dict(self.tag_map.get(tag_str, {}))
|
||||||
tag_str = self.strings[self.rich_tags[tag_id].name]
|
pos = self.strings.as_int(features.pop('POS'))
|
||||||
analysis.tag = rich_tag
|
lemma = self.lemmatize(pos, token.lex.orth, features)
|
||||||
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
self._cache.set(tag_id, token.lex.orth, lemma)
|
||||||
self.tag_map.get(tag_str, {}))
|
token.lemma = lemma
|
||||||
self._cache.set(tag_id, token.lex.orth, analysis)
|
token.pos = pos
|
||||||
token.lemma = analysis.lemma
|
token.tag = self.strings[tag_str]
|
||||||
token.pos = analysis.tag.pos
|
token.morph = self.add(attrs)
|
||||||
token.tag = analysis.tag.name
|
|
||||||
token.morph = analysis.tag.morph
|
|
||||||
|
|
||||||
cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1:
|
cdef update_morph(self, hash_t morph, features):
|
||||||
cdef flags_t one = 1
|
"""Update a morphological analysis with new feature values."""
|
||||||
if value:
|
tag = (<RichTagC*>self.tags.get(morph))[0]
|
||||||
flags[0] |= one << flag_id
|
cdef univ_morph_t feature
|
||||||
else:
|
cdef int value
|
||||||
flags[0] &= ~(one << flag_id)
|
for feature_, value in features.items():
|
||||||
|
feature = self.strings.as_int(feature_)
|
||||||
|
set_feature(&tag, feature, 1)
|
||||||
|
morph = self.insert_tag(tag)
|
||||||
|
return morph
|
||||||
|
|
||||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
|
def to_bytes(self):
|
||||||
force=False):
|
json_tags = []
|
||||||
"""Add a special-case rule to the morphological analyser. Tokens whose
|
for key in self.tags:
|
||||||
tag and orth match the rule will receive the specified properties.
|
tag_ptr = <RichTagC*>self.tags.get(key)
|
||||||
|
if tag_ptr != NULL:
|
||||||
|
json_tags.append(tag_to_json(tag_ptr[0]))
|
||||||
|
raise json.dumps(json_tags)
|
||||||
|
|
||||||
tag (unicode): The part-of-speech tag to key the exception.
|
def from_bytes(self, byte_string):
|
||||||
orth (unicode): The word-form to key the exception.
|
raise NotImplementedError
|
||||||
"""
|
|
||||||
# TODO: Currently we've assumed that we know the number of tags --
|
|
||||||
# RichTagC is an array, and _cache is a PreshMapArray
|
|
||||||
# This is really bad: it makes the morphology typed to the tagger
|
|
||||||
# classes, which is all wrong.
|
|
||||||
self.exc[(tag_str, orth_str)] = dict(attrs)
|
|
||||||
tag = self.strings.add(tag_str)
|
|
||||||
if tag not in self.reverse_index:
|
|
||||||
return
|
|
||||||
tag_id = self.reverse_index[tag]
|
|
||||||
orth = self.strings[orth_str]
|
|
||||||
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
|
||||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
|
||||||
cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
|
|
||||||
if cached is NULL:
|
|
||||||
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
|
||||||
elif force:
|
|
||||||
memset(cached, 0, sizeof(cached[0]))
|
|
||||||
else:
|
|
||||||
raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str))
|
|
||||||
|
|
||||||
cached.tag = rich_tag
|
def to_disk(self, path):
|
||||||
# TODO: Refactor this to take arbitrary attributes.
|
raise NotImplementedError
|
||||||
for name_id, value_id in attrs.items():
|
|
||||||
if name_id == LEMMA:
|
def from_disk(self, path):
|
||||||
cached.lemma = value_id
|
raise NotImplementedError
|
||||||
else:
|
|
||||||
self.assign_feature(&cached.tag.morph, name_id, value_id)
|
|
||||||
if cached.lemma == 0:
|
cpdef univ_pos_t get_int_tag(pos_):
|
||||||
cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
|
return <univ_pos_t>0
|
||||||
self._cache.set(tag_id, orth, <void*>cached)
|
|
||||||
|
cpdef intify_features(StringStore strings, features):
|
||||||
|
return {strings.as_int(feature) for feature in features}
|
||||||
|
|
||||||
|
cdef hash_t hash_tag(RichTagC tag) nogil:
|
||||||
|
return mrmr.hash64(&tag, sizeof(tag), 0)
|
||||||
|
|
||||||
|
cdef RichTagC create_rich_tag(pos_, features):
|
||||||
|
cdef RichTagC tag
|
||||||
|
cdef univ_morph_t feature
|
||||||
|
tag.pos = get_int_tag(pos_)
|
||||||
|
for feature in features:
|
||||||
|
set_feature(&tag, feature, 1)
|
||||||
|
return tag
|
||||||
|
|
||||||
|
cdef tag_to_json(RichTagC tag):
|
||||||
|
return {}
|
||||||
|
|
||||||
|
cdef RichTagC tag_from_json(json_tag):
|
||||||
|
cdef RichTagC tag
|
||||||
|
return tag
|
||||||
|
|
||||||
|
cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) nogil:
|
||||||
|
if value == True:
|
||||||
|
value_ = feature
|
||||||
|
else:
|
||||||
|
value_ = NIL
|
||||||
|
if feature == NIL:
|
||||||
|
pass
|
||||||
|
if is_abbr_feature(feature):
|
||||||
|
tag.abbr = value_
|
||||||
|
elif is_adp_type_feature(feature):
|
||||||
|
tag.adp_type = value_
|
||||||
|
elif is_adv_type_feature(feature):
|
||||||
|
tag.adv_type = value_
|
||||||
|
elif is_animacy_feature(feature):
|
||||||
|
tag.animacy = value_
|
||||||
|
elif is_aspect_feature(feature):
|
||||||
|
tag.aspect = value_
|
||||||
|
elif is_case_feature(feature):
|
||||||
|
tag.case = value_
|
||||||
|
elif is_conj_type_feature(feature):
|
||||||
|
tag.conj_type = value_
|
||||||
|
elif is_connegative_feature(feature):
|
||||||
|
tag.connegative = value_
|
||||||
|
elif is_definite_feature(feature):
|
||||||
|
tag.definite = value_
|
||||||
|
elif is_degree_feature(feature):
|
||||||
|
tag.degree = value_
|
||||||
|
elif is_derivation_feature(feature):
|
||||||
|
tag.derivation = value_
|
||||||
|
elif is_echo_feature(feature):
|
||||||
|
tag.echo = value_
|
||||||
|
elif is_foreign_feature(feature):
|
||||||
|
tag.foreign = value_
|
||||||
|
elif is_gender_feature(feature):
|
||||||
|
tag.gender = value_
|
||||||
|
elif is_hyph_feature(feature):
|
||||||
|
tag.hyph = value_
|
||||||
|
elif is_inf_form_feature(feature):
|
||||||
|
tag.inf_form = value_
|
||||||
|
elif is_mood_feature(feature):
|
||||||
|
tag.mood = value_
|
||||||
|
elif is_negative_feature(feature):
|
||||||
|
tag.negative = value_
|
||||||
|
elif is_number_feature(feature):
|
||||||
|
tag.number = value_
|
||||||
|
elif is_name_type_feature(feature):
|
||||||
|
tag.name_type = value_
|
||||||
|
elif is_num_form_feature(feature):
|
||||||
|
tag.num_form = value_
|
||||||
|
elif is_num_value_feature(feature):
|
||||||
|
tag.num_value = value_
|
||||||
|
elif is_part_form_feature(feature):
|
||||||
|
tag.part_form = value_
|
||||||
|
elif is_part_type_feature(feature):
|
||||||
|
tag.part_type = value_
|
||||||
|
elif is_person_feature(feature):
|
||||||
|
tag.person = value_
|
||||||
|
elif is_polite_feature(feature):
|
||||||
|
tag.polite = value_
|
||||||
|
elif is_polarity_feature(feature):
|
||||||
|
tag.polarity = value_
|
||||||
|
elif is_poss_feature(feature):
|
||||||
|
tag.poss = value_
|
||||||
|
elif is_prefix_feature(feature):
|
||||||
|
tag.prefix = value_
|
||||||
|
elif is_prep_case_feature(feature):
|
||||||
|
tag.prep_case = value_
|
||||||
|
elif is_pron_type_feature(feature):
|
||||||
|
tag.pron_type = value_
|
||||||
|
elif is_punct_side_feature(feature):
|
||||||
|
tag.punct_type = value_
|
||||||
|
elif is_reflex_feature(feature):
|
||||||
|
tag.reflex = value_
|
||||||
|
elif is_style_feature(feature):
|
||||||
|
tag.style = value_
|
||||||
|
elif is_style_variant_feature(feature):
|
||||||
|
tag.style_variant = value_
|
||||||
|
elif is_tense_feature(feature):
|
||||||
|
tag.tense = value_
|
||||||
|
elif is_verb_form_feature(feature):
|
||||||
|
tag.verb_form = value_
|
||||||
|
elif is_voice_feature(feature):
|
||||||
|
tag.voice = value_
|
||||||
|
elif is_verb_type_feature(feature):
|
||||||
|
tag.verb_type = value_
|
||||||
|
else:
|
||||||
|
with gil:
|
||||||
|
raise ValueError("Unknown feature: %d" % feature)
|
||||||
|
|
||||||
|
cdef int is_abbr_feature(univ_morph_t abbr) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_adp_type_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_adv_type_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_animacy_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_aspect_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_case_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_conj_type_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_connegative_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_definite_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_degree_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_derivation_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_echo_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_foreign_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_gender_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_hyph_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_inf_form_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_mood_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_negative_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_number_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_name_type_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_num_form_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_num_type_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_num_value_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_part_form_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_part_type_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_person_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_polite_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_polarity_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_poss_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_prefix_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_prep_case_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_pron_type_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_punct_side_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_punct_type_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_reflex_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_style_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_style_variant_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_tense_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_verb_form_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_voice_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cdef int is_verb_type_feature(univ_morph_t feature) nogil:
|
||||||
|
return 0
|
||||||
|
|
||||||
def load_morph_exceptions(self, dict exc):
|
|
||||||
# Map (form, pos) to (lemma, rich tag)
|
|
||||||
for tag_str, entries in exc.items():
|
|
||||||
for form_str, attrs in entries.items():
|
|
||||||
self.add_special_case(tag_str, form_str, attrs)
|
|
||||||
|
|
||||||
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
|
||||||
if orth not in self.strings:
|
|
||||||
return orth
|
|
||||||
cdef unicode py_string = self.strings[orth]
|
|
||||||
if self.lemmatizer is None:
|
|
||||||
return self.strings.add(py_string.lower())
|
|
||||||
cdef list lemma_strings
|
|
||||||
cdef unicode lemma_string
|
|
||||||
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
|
||||||
lemma_string = lemma_strings[0]
|
|
||||||
lemma = self.strings.add(lemma_string)
|
|
||||||
return lemma
|
|
||||||
|
|
||||||
|
|
||||||
IDS = {
|
IDS = {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user