spaCy/spacy/morphology.pyx

988 lines
33 KiB
Cython
Raw Normal View History

# cython: infer_types
# coding: utf8
from __future__ import unicode_literals
from libc.string cimport memset
2019-03-07 03:17:19 +03:00
import srsly
2019-03-07 03:17:19 +03:00
from .strings import get_string_id
2018-09-25 16:18:21 +03:00
from . import symbols
from .attrs cimport POS, IS_SPACE
2017-10-27 22:07:59 +03:00
from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
2016-01-19 05:36:51 +03:00
from .lexeme cimport Lexeme
from .errors import Errors
2015-08-28 00:11:51 +03:00
2018-09-25 00:57:41 +03:00
def _normalize_props(props):
2017-10-27 22:07:59 +03:00
"""Transform deprecated string keys to correct names."""
out = {}
2018-09-25 16:18:21 +03:00
props = dict(props)
2018-09-26 22:03:57 +03:00
for key in FIELDS:
2018-09-25 16:18:21 +03:00
if key in props:
attr = '%s_%s' % (key, props[key])
if attr in IDS:
props.pop(key)
props[attr] = True
for key, value in props.items():
if key == POS:
if hasattr(value, 'upper'):
value = value.upper()
if value in POS_IDS:
value = POS_IDS[value]
out[key] = value
2017-03-25 23:56:41 +03:00
elif isinstance(key, int):
out[key] = value
elif key.lower() == 'pos':
out[POS] = POS_IDS[value.upper()]
else:
out[key] = value
return out
def parse_feature(feature):
if not hasattr(feature, 'split'):
feature = NAMES[feature]
key, value = feature.split('_')
begin = 'begin_%s' % key
2018-09-26 22:03:57 +03:00
# Note that this includes a 0 offset for the field, for no entry
offset = IDS[feature] - IDS[begin]
field_id = FIELDS[key]
return (field_id, offset)
2018-09-26 00:03:43 +03:00
def get_field_size(field):
begin = 'begin_%s' % field
end = 'end_%s' % field
2018-09-26 22:03:57 +03:00
# Extra field for no entry -- always 0
return IDS[end] - IDS[begin]
2018-09-26 00:03:43 +03:00
cdef class Morphology:
2018-09-25 00:57:41 +03:00
'''Store the possible morphological analyses for a language, and index them
by hash.
To save space on each token, tokens only know the hash of their morphological
analysis, so queries of morphological attributes are delegated
to this class.
'''
2017-06-05 00:34:32 +03:00
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
self.mem = Pool()
self.strings = string_store
2018-09-25 00:57:41 +03:00
self.tags = PreshMap()
# Add special space symbol. We prefix with underscore, to make sure it
# always sorts to the end.
space_attrs = tag_map.get('SP', {POS: SPACE})
if '_SP' not in tag_map:
self.strings.add('_SP')
tag_map = dict(tag_map)
tag_map['_SP'] = space_attrs
self.tag_names = tuple(sorted(tag_map.keys()))
self.tag_map = {}
self.lemmatizer = lemmatizer
2017-06-03 21:29:23 +03:00
self.n_tags = len(tag_map)
self.reverse_index = {}
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
2018-09-25 16:18:21 +03:00
attrs = _normalize_props(attrs)
2017-03-25 23:56:41 +03:00
self.tag_map[tag_str] = dict(attrs)
2018-09-25 11:57:33 +03:00
self.reverse_index[self.strings.add(tag_str)] = i
2017-10-11 11:33:06 +03:00
self._cache = PreshMapArray(self.n_tags)
2017-06-05 00:34:32 +03:00
self.exc = {}
if exc is not None:
2018-09-25 16:18:21 +03:00
for (tag, orth), attrs in exc.items():
self.add_special_case(
self.strings.as_string(tag), self.strings.as_string(orth), attrs)
2018-09-25 11:57:33 +03:00
def __reduce__(self):
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
self.exc), None, None)
2018-09-25 00:57:41 +03:00
def add(self, features):
"""Insert a morphological analysis in the morphology table, if not already
present. Returns the hash of the new analysis.
"""
2018-09-25 21:53:24 +03:00
features = intify_features(features)
2018-09-26 22:03:57 +03:00
cdef univ_morph_t feature
for feature in features:
if feature != 0 and feature not in NAMES:
print(list(NAMES.keys())[:10])
print(NAMES.get(feature-1), NAMES.get(feature+1))
raise KeyError("Unknown feature: %d" % feature)
cdef RichTagC tag
tag = create_rich_tag(features)
2018-09-25 00:57:41 +03:00
cdef hash_t key = self.insert(tag)
return key
2018-09-26 22:03:57 +03:00
def get(self, hash_t morph):
tag = <RichTagC*>self.tags.get(morph)
if tag == NULL:
return []
else:
return tag_to_json(tag[0])
2018-09-25 21:53:24 +03:00
2018-09-25 22:07:08 +03:00
cpdef update(self, hash_t morph, features):
2018-09-25 21:53:24 +03:00
"""Update a morphological analysis with new feature values."""
tag = (<RichTagC*>self.tags.get(morph))[0]
2018-09-25 22:07:08 +03:00
features = intify_features(features)
2018-09-25 21:53:24 +03:00
cdef univ_morph_t feature
2018-09-25 22:07:08 +03:00
for feature in features:
2018-09-25 21:53:24 +03:00
set_feature(&tag, feature, 1)
2018-09-25 22:07:08 +03:00
morph = self.insert(tag)
2018-09-25 21:53:24 +03:00
return morph
2018-09-25 00:57:41 +03:00
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
if orth not in self.strings:
return orth
cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None:
return self.strings.add(py_string.lower())
cdef list lemma_strings
cdef unicode lemma_string
# Normalize features into a dict keyed by the field, to make life easier
# for the lemmatizer. Handles string-to-int conversion too.
string_feats = {}
for key, value in morphology.items():
if value is True:
name, value = self.strings.as_string(key).split('_', 1)
string_feats[name] = value
else:
string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
2018-09-25 00:57:41 +03:00
lemma_string = lemma_strings[0]
lemma = self.strings.add(lemma_string)
return lemma
2018-09-25 11:57:33 +03:00
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
force=False):
"""Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties.
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
"""
2018-09-25 16:18:21 +03:00
attrs = dict(attrs)
attrs = _normalize_props(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.exc[(tag_str, self.strings.add(orth_str))] = attrs
2018-09-25 00:57:41 +03:00
cdef hash_t insert(self, RichTagC tag) except 0:
cdef hash_t key = hash_tag(tag)
if self.tags.get(key) == NULL:
tag_ptr = <RichTagC*>self.mem.alloc(1, sizeof(RichTagC))
tag_ptr[0] = tag
self.tags.set(key, <void*>tag_ptr)
return key
cdef int assign_untagged(self, TokenC* token) except -1:
"""Set morphological attributes on a token without a POS tag. Uses
the lemmatizer's lookup() method, which looks up the string in the
2017-10-27 22:07:59 +03:00
table provided by the language data as lemma_lookup (if available).
"""
2017-10-11 11:33:06 +03:00
if token.lemma == 0:
orth_str = self.strings[token.lex.orth]
lemma = self.lemmatizer.lookup(orth_str)
token.lemma = self.strings.add(lemma)
2018-09-25 11:57:33 +03:00
cdef int assign_tag(self, TokenC* token, tag_str) except -1:
cdef attr_t tag = self.strings.as_int(tag_str)
2017-05-28 15:06:40 +03:00
if tag in self.reverse_index:
tag_id = self.reverse_index[tag]
2017-05-28 15:06:40 +03:00
self.assign_tag_id(token, tag_id)
else:
token.tag = tag
2016-11-04 21:19:09 +03:00
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id > self.n_tags:
raise ValueError(Errors.E014.format(tag=tag_id))
# Ensure spaces get tagged as space.
# It seems pretty arbitrary to put this logic here, but there's really
# nowhere better. I guess the justification is that this is where the
# specific word and the tag interact. Still, we should have a better
# way to enforce this rule, or figure out why the statistical model fails.
# Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings.add('_SP')]
2018-09-25 01:35:59 +03:00
tag_str = self.tag_names[tag_id]
features = dict(self.tag_map.get(tag_str, {}))
2018-09-25 16:18:21 +03:00
if features:
2018-09-25 11:57:33 +03:00
pos = self.strings.as_int(features.pop(POS))
else:
pos = 0
2018-09-25 16:18:21 +03:00
cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
if lemma == 0:
2019-03-07 14:14:37 +03:00
# Ugh, self.lemmatize has opposite arg order from self.lemmatizer :(
2018-09-25 16:18:21 +03:00
lemma = self.lemmatize(pos, token.lex.orth, features)
self._cache.set(tag_id, token.lex.orth, <void*>lemma)
2018-09-25 00:57:41 +03:00
token.lemma = lemma
2018-09-25 11:57:33 +03:00
token.pos = <univ_pos_t>pos
2018-09-25 00:57:41 +03:00
token.tag = self.strings[tag_str]
2018-09-26 22:03:57 +03:00
#token.morph = self.add(features)
token.morph = 0
2018-09-25 16:18:21 +03:00
if (self.tag_names[tag_id], token.lex.orth) in self.exc:
self._assign_tag_from_exceptions(token, tag_id)
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
key = (self.tag_names[tag_id], token.lex.orth)
cdef dict attrs
attrs = self.exc[key]
token.pos = attrs.get(POS, token.pos)
token.lemma = attrs.get(LEMMA, token.lemma)
2018-09-25 16:18:21 +03:00
def load_morph_exceptions(self, dict exc):
# Map (form, pos) to attributes
2018-09-25 16:18:21 +03:00
for tag_str, entries in exc.items():
for form_str, attrs in entries.items():
self.add_special_case(tag_str, form_str, attrs)
2018-09-25 00:57:41 +03:00
def to_bytes(self):
json_tags = []
for key in self.tags:
tag_ptr = <RichTagC*>self.tags.get(key)
if tag_ptr != NULL:
json_tags.append(tag_to_json(tag_ptr[0]))
2019-03-07 03:17:19 +03:00
return srsly.json_dumps(json_tags)
2018-09-25 00:57:41 +03:00
def from_bytes(self, byte_string):
raise NotImplementedError
def to_disk(self, path):
raise NotImplementedError
def from_disk(self, path):
raise NotImplementedError
cpdef univ_pos_t get_int_tag(pos_):
return <univ_pos_t>0
2018-09-25 21:53:24 +03:00
cpdef intify_features(features):
return {IDS.get(feature, feature) for feature in features}
2018-09-25 00:57:41 +03:00
cdef hash_t hash_tag(RichTagC tag) nogil:
return mrmr.hash64(&tag, sizeof(tag), 0)
2018-09-26 22:03:57 +03:00
cdef RichTagC create_rich_tag(features) except *:
2018-09-25 00:57:41 +03:00
cdef RichTagC tag
cdef univ_morph_t feature
2018-09-25 22:07:08 +03:00
memset(&tag, 0, sizeof(tag))
for feature in features:
set_feature(&tag, feature, 1)
2018-09-25 00:57:41 +03:00
return tag
cdef tag_to_json(RichTagC tag):
2018-09-26 22:03:57 +03:00
features = []
if tag.abbr != 0:
features.append(NAMES[tag.abbr])
if tag.adp_type != 0:
features.append(NAMES[tag.adp_type])
if tag.adv_type != 0:
features.append(NAMES[tag.adv_type])
if tag.animacy != 0:
features.append(NAMES[tag.animacy])
if tag.aspect != 0:
features.append(NAMES[tag.aspect])
if tag.case != 0:
features.append(NAMES[tag.case])
if tag.conj_type != 0:
features.append(NAMES[tag.conj_type])
if tag.connegative != 0:
features.append(NAMES[tag.connegative])
if tag.definite != 0:
features.append(NAMES[tag.definite])
if tag.degree != 0:
features.append(NAMES[tag.degree])
if tag.derivation != 0:
features.append(NAMES[tag.derivation])
if tag.echo != 0:
features.append(NAMES[tag.echo])
if tag.foreign != 0:
features.append(NAMES[tag.foreign])
if tag.gender != 0:
features.append(NAMES[tag.gender])
if tag.hyph != 0:
features.append(NAMES[tag.hyph])
if tag.inf_form != 0:
features.append(NAMES[tag.inf_form])
if tag.mood != 0:
features.append(NAMES[tag.mood])
if tag.negative != 0:
features.append(NAMES[tag.negative])
if tag.number != 0:
features.append(NAMES[tag.number])
if tag.name_type != 0:
features.append(NAMES[tag.name_type])
if tag.noun_type != 0:
features.append(NAMES[tag.noun_type])
if tag.num_form != 0:
features.append(NAMES[tag.num_form])
if tag.num_type != 0:
features.append(NAMES[tag.num_type])
if tag.num_value != 0:
features.append(NAMES[tag.num_value])
if tag.part_form != 0:
features.append(NAMES[tag.part_form])
if tag.part_type != 0:
features.append(NAMES[tag.part_type])
if tag.person != 0:
features.append(NAMES[tag.person])
if tag.polite != 0:
features.append(NAMES[tag.polite])
if tag.polarity != 0:
features.append(NAMES[tag.polarity])
if tag.poss != 0:
features.append(NAMES[tag.poss])
if tag.prefix != 0:
features.append(NAMES[tag.prefix])
if tag.prep_case != 0:
features.append(NAMES[tag.prep_case])
if tag.pron_type != 0:
features.append(NAMES[tag.pron_type])
if tag.punct_side != 0:
features.append(NAMES[tag.punct_side])
if tag.punct_type != 0:
features.append(NAMES[tag.punct_type])
if tag.reflex != 0:
features.append(NAMES[tag.reflex])
if tag.style != 0:
features.append(NAMES[tag.style])
if tag.style_variant != 0:
features.append(NAMES[tag.style_variant])
if tag.tense != 0:
features.append(NAMES[tag.tense])
if tag.verb_form != 0:
features.append(NAMES[tag.verb_form])
if tag.voice != 0:
features.append(NAMES[tag.voice])
if tag.verb_type != 0:
features.append(NAMES[tag.verb_type])
return features
2018-09-25 00:57:41 +03:00
cdef RichTagC tag_from_json(json_tag):
cdef RichTagC tag
return tag
2018-09-26 22:03:57 +03:00
cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) except -1:
2018-09-25 00:57:41 +03:00
if value == True:
value_ = feature
else:
value_ = NIL
if feature == NIL:
pass
2018-09-26 22:03:57 +03:00
elif is_abbr_feature(feature):
2018-09-25 00:57:41 +03:00
tag.abbr = value_
elif is_adp_type_feature(feature):
tag.adp_type = value_
elif is_adv_type_feature(feature):
tag.adv_type = value_
elif is_animacy_feature(feature):
tag.animacy = value_
elif is_aspect_feature(feature):
tag.aspect = value_
elif is_case_feature(feature):
tag.case = value_
elif is_conj_type_feature(feature):
tag.conj_type = value_
elif is_connegative_feature(feature):
tag.connegative = value_
elif is_definite_feature(feature):
tag.definite = value_
elif is_degree_feature(feature):
tag.degree = value_
elif is_derivation_feature(feature):
tag.derivation = value_
elif is_echo_feature(feature):
tag.echo = value_
elif is_foreign_feature(feature):
tag.foreign = value_
elif is_gender_feature(feature):
tag.gender = value_
elif is_hyph_feature(feature):
tag.hyph = value_
elif is_inf_form_feature(feature):
tag.inf_form = value_
elif is_mood_feature(feature):
tag.mood = value_
elif is_negative_feature(feature):
tag.negative = value_
elif is_number_feature(feature):
tag.number = value_
elif is_name_type_feature(feature):
tag.name_type = value_
2018-09-26 22:03:57 +03:00
elif is_noun_type_feature(feature):
tag.noun_type = value_
2018-09-25 00:57:41 +03:00
elif is_num_form_feature(feature):
tag.num_form = value_
2018-09-26 22:03:57 +03:00
elif is_num_type_feature(feature):
tag.num_type = value_
2018-09-25 00:57:41 +03:00
elif is_num_value_feature(feature):
tag.num_value = value_
elif is_part_form_feature(feature):
tag.part_form = value_
elif is_part_type_feature(feature):
tag.part_type = value_
elif is_person_feature(feature):
tag.person = value_
elif is_polite_feature(feature):
tag.polite = value_
elif is_polarity_feature(feature):
tag.polarity = value_
elif is_poss_feature(feature):
tag.poss = value_
elif is_prefix_feature(feature):
tag.prefix = value_
elif is_prep_case_feature(feature):
tag.prep_case = value_
elif is_pron_type_feature(feature):
tag.pron_type = value_
elif is_punct_side_feature(feature):
2018-09-26 22:03:57 +03:00
tag.punct_side = value_
elif is_punct_type_feature(feature):
2018-09-25 00:57:41 +03:00
tag.punct_type = value_
elif is_reflex_feature(feature):
tag.reflex = value_
elif is_style_feature(feature):
tag.style = value_
elif is_style_variant_feature(feature):
tag.style_variant = value_
elif is_tense_feature(feature):
tag.tense = value_
2018-09-26 22:03:57 +03:00
elif is_typo_feature(feature):
tag.typo = value_
2018-09-25 00:57:41 +03:00
elif is_verb_form_feature(feature):
tag.verb_form = value_
elif is_voice_feature(feature):
tag.voice = value_
elif is_verb_type_feature(feature):
tag.verb_type = value_
else:
2018-09-26 22:03:57 +03:00
raise ValueError("Unknown feature: %s (%d)" % (NAMES.get(feature), feature))
2018-09-25 00:57:41 +03:00
cdef int is_abbr_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Abbr and feature <= end_Abbr
2018-09-25 00:57:41 +03:00
cdef int is_adp_type_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_AdpType and feature <= end_AdpType
2018-09-25 00:57:41 +03:00
cdef int is_adv_type_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_AdvType and feature <= end_AdvType
2018-09-25 00:57:41 +03:00
cdef int is_animacy_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Animacy and feature <= end_Animacy
2018-09-25 00:57:41 +03:00
cdef int is_aspect_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Aspect and feature <= end_Aspect
2018-09-25 00:57:41 +03:00
cdef int is_case_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Case and feature <= end_Case
2018-09-25 00:57:41 +03:00
cdef int is_conj_type_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_ConjType and feature <= end_ConjType
2018-09-25 00:57:41 +03:00
cdef int is_connegative_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Connegative and feature <= end_Connegative
2018-09-25 00:57:41 +03:00
cdef int is_definite_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Definite and feature <= end_Definite
2018-09-25 00:57:41 +03:00
cdef int is_degree_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Degree and feature <= end_Degree
2018-09-25 00:57:41 +03:00
cdef int is_derivation_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Derivation and feature <= end_Derivation
2018-09-25 00:57:41 +03:00
cdef int is_echo_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Echo and feature <= end_Echo
2018-09-25 00:57:41 +03:00
cdef int is_foreign_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Foreign and feature <= end_Foreign
2018-09-25 00:57:41 +03:00
cdef int is_gender_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Gender and feature <= end_Gender
2018-09-25 00:57:41 +03:00
cdef int is_hyph_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Hyph and feature <= end_Hyph
2018-09-25 00:57:41 +03:00
cdef int is_inf_form_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_InfForm and feature <= end_InfForm
2018-09-25 00:57:41 +03:00
cdef int is_mood_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Mood and feature <= end_Mood
cdef int is_name_type_feature(univ_morph_t feature) nogil:
return feature >= begin_NameType and feature < end_NameType
2018-09-25 00:57:41 +03:00
cdef int is_negative_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Negative and feature <= end_Negative
2018-09-25 00:57:41 +03:00
2018-09-26 22:03:57 +03:00
cdef int is_noun_type_feature(univ_morph_t feature) nogil:
return feature >= begin_NounType and feature <= end_NounType
2018-09-25 00:57:41 +03:00
2018-09-26 22:03:57 +03:00
cdef int is_number_feature(univ_morph_t feature) nogil:
return feature >= begin_Number and feature <= end_Number
2018-09-25 00:57:41 +03:00
cdef int is_num_form_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_NumForm and feature <= end_NumForm
2018-09-25 00:57:41 +03:00
cdef int is_num_type_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_NumType and feature <= end_NumType
2018-09-25 00:57:41 +03:00
cdef int is_num_value_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_NumValue and feature <= end_NumValue
2018-09-25 00:57:41 +03:00
cdef int is_part_form_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_PartForm and feature <= end_PartForm
2018-09-25 00:57:41 +03:00
cdef int is_part_type_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_PartType and feature <= end_PartType
2018-09-25 00:57:41 +03:00
cdef int is_person_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Person and feature <= end_Person
2018-09-25 00:57:41 +03:00
cdef int is_polite_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Polite and feature <= end_Polite
2018-09-25 00:57:41 +03:00
cdef int is_polarity_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Polarity and feature <= end_Polarity
2018-09-25 00:57:41 +03:00
cdef int is_poss_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Poss and feature <= end_Poss
2018-09-25 00:57:41 +03:00
cdef int is_prefix_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Prefix and feature <= end_Prefix
2018-09-25 00:57:41 +03:00
cdef int is_prep_case_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_PrepCase and feature <= end_PrepCase
2018-09-25 00:57:41 +03:00
cdef int is_pron_type_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_PronType and feature <= end_PronType
2018-09-25 00:57:41 +03:00
cdef int is_punct_side_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_PunctSide and feature <= end_PunctSide
2018-09-25 00:57:41 +03:00
cdef int is_punct_type_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_PunctType and feature <= end_PunctType
2018-09-25 00:57:41 +03:00
cdef int is_reflex_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Reflex and feature <= end_Reflex
2018-09-25 00:57:41 +03:00
cdef int is_style_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Style and feature <= end_Style
2018-09-25 00:57:41 +03:00
cdef int is_style_variant_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_StyleVariant and feature <= end_StyleVariant
2018-09-25 00:57:41 +03:00
cdef int is_tense_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Tense and feature <= end_Tense
cdef int is_typo_feature(univ_morph_t feature) nogil:
return feature >= begin_Typo and feature <= end_Typo
2018-09-25 00:57:41 +03:00
cdef int is_verb_form_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_VerbForm and feature <= end_VerbForm
2018-09-25 00:57:41 +03:00
cdef int is_voice_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_Voice and feature <= end_Voice
2018-09-25 00:57:41 +03:00
cdef int is_verb_type_feature(univ_morph_t feature) nogil:
2018-09-26 22:03:57 +03:00
return feature >= begin_VerbType and feature <= end_VerbType
2016-12-18 17:48:00 +03:00
FIELDS = {
'Abbr': 0,
'AdpType': 1,
'AdvType': 2,
'Animacy': 3,
'Aspect': 4,
'Case': 5,
'ConjType': 6,
'Connegative': 7,
'Definite': 8,
'Degree': 9,
'Derivation': 10,
'Echo': 11,
'Foreign': 12,
'Gender': 13,
'Hyph': 14,
'InfForm': 15,
'Mood': 16,
2018-09-26 22:03:57 +03:00
'NameType': 17,
'Negative': 18,
'Number': 19,
'NumForm': 20,
'NumType': 21,
'NumValue': 22,
'PartForm': 23,
'PartType': 24,
'Person': 25,
'Polite': 26,
'Polarity': 27,
'Poss': 28,
'Prefix': 29,
'PrepCase': 30,
'PronType': 31,
'PunctSide': 32,
'PunctType': 33,
'Reflex': 34,
'Style': 35,
'StyleVariant': 36,
'Tense': 37,
2018-09-26 22:03:57 +03:00
'Typo': 38,
'VerbForm': 39,
'Voice': 40,
'VerbType': 41
}
2018-09-26 00:03:43 +03:00
IDS = {
"begin_Abbr": begin_Abbr,
2018-09-26 22:03:57 +03:00
"Abbr_yes": Abbr_yes ,
2018-09-26 00:03:43 +03:00
"end_Abbr": end_Abbr,
"begin_AdpType": begin_AdpType,
"AdpType_circ": AdpType_circ,
"AdpType_comprep": AdpType_comprep,
"AdpType_prep ": AdpType_prep ,
"AdpType_post": AdpType_post,
"AdpType_voc": AdpType_voc,
"end_AdpType": end_AdpType,
"begin_AdvType": begin_AdvType,
"AdvType_adadj": AdvType_adadj,
"AdvType_cau": AdvType_cau,
"AdvType_deg": AdvType_deg,
"AdvType_ex": AdvType_ex,
"AdvType_loc": AdvType_loc,
"AdvType_man": AdvType_man,
"AdvType_mod": AdvType_mod,
"AdvType_sta": AdvType_sta,
"AdvType_tim": AdvType_tim,
"end_AdvType": end_AdvType,
"begin_Animacy": begin_Animacy,
"Animacy_anim": Animacy_anim,
"Animacy_hum": Animacy_hum,
"Animacy_inan": Animacy_inan,
"Animacy_nhum": Animacy_nhum,
"end_Animacy": end_Animacy,
"begin_Aspect": begin_Aspect,
"Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod,
"Aspect_none": Aspect_none,
"Aspect_perf": Aspect_perf,
"end_Aspect": end_Aspect,
"begin_Case": begin_Case,
"Case_abe": Case_abe,
"Case_abl": Case_abl,
"Case_abs": Case_abs,
"Case_acc": Case_acc,
"Case_ade": Case_ade,
"Case_all": Case_all,
"Case_cau": Case_cau,
"Case_com": Case_com,
"Case_dat": Case_dat,
"Case_del": Case_del,
"Case_dis": Case_dis,
"Case_ela": Case_ela,
"Case_ess": Case_ess,
"Case_gen": Case_gen,
"Case_ill": Case_ill,
"Case_ine": Case_ine,
"Case_ins": Case_ins,
"Case_loc": Case_loc,
"Case_lat": Case_lat,
"Case_nom": Case_nom,
"Case_par": Case_par,
"Case_sub": Case_sub,
"Case_sup": Case_sup,
"Case_tem": Case_tem,
"Case_ter": Case_ter,
"Case_tra": Case_tra,
"Case_voc": Case_voc,
"end_Case": end_Case,
"begin_ConjType": begin_ConjType,
"ConjType_comp ": ConjType_comp ,
"ConjType_oper": ConjType_oper,
"end_ConjType": end_ConjType,
"begin_Connegative": begin_Connegative,
"Connegative_yes": Connegative_yes,
"end_Connegative": end_Connegative,
"begin_Definite": begin_Definite,
"Definite_cons": Definite_cons,
"Definite_def": Definite_def,
"Definite_ind": Definite_ind,
"Definite_red": Definite_red,
"Definite_two": Definite_two,
"end_Definite": end_Definite,
"begin_Degree": begin_Degree,
"Degree_abs": Degree_abs,
"Degree_cmp": Degree_cmp,
"Degree_comp": Degree_comp,
"Degree_none": Degree_none,
"Degree_pos": Degree_pos,
"Degree_sup": Degree_sup,
"Degree_com": Degree_com,
"Degree_dim": Degree_dim,
"end_Degree": end_Degree,
2018-09-26 22:03:57 +03:00
"begin_Derivation": begin_Derivation,
"Derivation_minen": Derivation_minen,
"Derivation_sti": Derivation_sti,
"Derivation_inen": Derivation_inen,
"Derivation_lainen": Derivation_lainen,
"Derivation_ja": Derivation_ja,
"Derivation_ton": Derivation_ton,
"Derivation_vs": Derivation_vs,
"Derivation_ttain": Derivation_ttain,
"Derivation_ttaa": Derivation_ttaa,
"end_Derivation": end_Derivation,
"begin_Echo": begin_Echo,
"Echo_rdp": Echo_rdp,
"Echo_ech": Echo_ech,
"end_Echo": end_Echo,
"begin_Foreign": begin_Foreign,
"Foreign_foreign": Foreign_foreign,
"Foreign_fscript": Foreign_fscript,
"Foreign_tscript": Foreign_tscript,
"Foreign_yes": Foreign_yes,
"end_Foreign": end_Foreign,
2018-09-26 00:03:43 +03:00
"begin_Gender": begin_Gender,
"Gender_com": Gender_com,
"Gender_fem": Gender_fem,
"Gender_masc": Gender_masc,
"Gender_neut": Gender_neut,
"Gender_dat_masc": Gender_dat_masc,
"Gender_dat_fem": Gender_dat_fem,
"Gender_erg_masc": Gender_erg_masc,
"Gender_erg_fem": Gender_erg_fem,
"Gender_psor_masc": Gender_psor_masc,
"Gender_psor_fem": Gender_psor_fem,
"Gender_psor_neut": Gender_psor_neut,
"end_Gender": end_Gender,
2018-09-26 22:03:57 +03:00
"begin_Hyph": begin_Hyph,
"Hyph_yes": Hyph_yes,
"end_Hyph": end_Hyph,
"begin_InfForm": begin_InfForm,
"InfForm_one": InfForm_one,
"InfForm_two": InfForm_two,
"InfForm_three": InfForm_three,
"end_InfForm": end_InfForm,
2018-09-26 00:03:43 +03:00
"begin_Mood": begin_Mood,
"Mood_cnd": Mood_cnd,
"Mood_imp": Mood_imp,
"Mood_ind": Mood_ind,
"Mood_n": Mood_n,
"Mood_pot": Mood_pot,
"Mood_sub": Mood_sub,
"Mood_opt": Mood_opt,
"end_Mood": end_Mood,
2018-09-26 22:03:57 +03:00
"begin_NameType": begin_NameType,
"NameType_geo": NameType_geo,
"NameType_prs": NameType_prs,
"NameType_giv": NameType_giv,
"NameType_sur": NameType_sur,
"NameType_nat": NameType_nat,
"NameType_com": NameType_com,
"NameType_pro": NameType_pro,
"NameType_oth": NameType_oth,
"end_NameType": end_NameType,
2018-09-26 00:03:43 +03:00
"begin_Negative": begin_Negative,
"Negative_neg": Negative_neg,
"Negative_pos": Negative_pos,
"Negative_yes": Negative_yes,
"end_Negative": end_Negative,
2018-09-26 22:03:57 +03:00
"begin_NounType": begin_NounType,
"NounType_com": NounType_com,
"NounType_prop": NounType_prop,
"NounType_class": NounType_class,
"end_NounType": end_NounType,
2018-09-26 00:03:43 +03:00
"begin_Number": begin_Number,
"Number_com": Number_com,
"Number_dual": Number_dual,
"Number_none": Number_none,
"Number_plur": Number_plur,
"Number_sing": Number_sing,
"Number_ptan": Number_ptan,
"Number_count": Number_count,
"Number_abs_sing": Number_abs_sing,
"Number_abs_plur": Number_abs_plur,
"Number_dat_sing": Number_dat_sing,
"Number_dat_plur": Number_dat_plur,
"Number_erg_sing": Number_erg_sing,
"Number_erg_plur": Number_erg_plur,
"Number_psee_sing": Number_psee_sing,
"Number_psee_plur": Number_psee_plur,
"Number_psor_sing": Number_psor_sing,
"Number_psor_plur": Number_psor_plur,
"end_Number": end_Number,
2018-09-26 22:03:57 +03:00
"begin_NumForm": begin_NumForm,
"NumForm_digit": NumForm_digit,
"NumForm_roman": NumForm_roman,
"NumForm_word": NumForm_word,
"end_NumForm": end_NumForm,
2018-09-26 00:03:43 +03:00
"begin_NumType": begin_NumType,
"NumType_card": NumType_card,
"NumType_dist": NumType_dist,
"NumType_frac": NumType_frac,
"NumType_gen": NumType_gen,
"NumType_mult": NumType_mult,
"NumType_none": NumType_none,
"NumType_ord": NumType_ord,
"NumType_sets": NumType_sets,
"end_NumType": end_NumType,
2018-09-26 22:03:57 +03:00
"begin_NumValue": begin_NumValue,
"NumValue_one": NumValue_one,
"NumValue_two": NumValue_two,
"NumValue_three": NumValue_three,
"end_NumValue": end_NumValue,
"begin_PartForm": begin_PartForm,
"PartForm_pres": PartForm_pres,
"PartForm_past": PartForm_past,
"PartForm_agt": PartForm_agt,
"PartForm_neg": PartForm_neg,
"end_PartForm": end_PartForm,
"begin_PartType": begin_PartType,
"PartType_mod": PartType_mod,
"PartType_emp": PartType_emp,
"PartType_res": PartType_res,
"PartType_inf": PartType_inf,
"PartType_vbp": PartType_vbp,
"end_PartType": end_PartType,
2018-09-26 00:03:43 +03:00
"begin_Person": begin_Person,
"Person_one": Person_one,
"Person_two": Person_two,
"Person_three": Person_three,
"Person_none": Person_none,
"Person_abs_one": Person_abs_one,
"Person_abs_two": Person_abs_two,
"Person_abs_three": Person_abs_three,
"Person_dat_one": Person_dat_one,
"Person_dat_two": Person_dat_two,
"Person_dat_three": Person_dat_three,
"Person_erg_one": Person_erg_one,
"Person_erg_two": Person_erg_two,
"Person_erg_three": Person_erg_three,
"Person_psor_one": Person_psor_one,
"Person_psor_two": Person_psor_two,
"Person_psor_three": Person_psor_three,
"end_Person": end_Person,
2018-09-26 22:03:57 +03:00
"begin_Polarity": begin_Polarity,
"Polarity_neg": Polarity_neg,
"Polarity_pos": Polarity_pos,
"end_Polarity": end_Polarity,
"begin_Polite": begin_Polite,
"Polite_inf": Polite_inf,
"Polite_pol": Polite_pol,
"Polite_abs_inf": Polite_abs_inf,
"Polite_abs_pol": Polite_abs_pol,
"Polite_erg_inf": Polite_erg_inf,
"Polite_erg_pol": Polite_erg_pol,
"Polite_dat_inf": Polite_dat_inf,
"Polite_dat_pol": Polite_dat_pol,
"end_Polite": end_Polite,
2018-09-26 00:03:43 +03:00
"begin_Poss": begin_Poss,
"Poss_yes": Poss_yes,
"end_Poss": end_Poss,
2018-09-26 22:03:57 +03:00
"begin_Prefix": begin_Prefix,
"Prefix_yes": Prefix_yes,
"end_Prefix": end_Prefix,
"begin_PrepCase": begin_PrepCase,
"PrepCase_npr": PrepCase_npr,
"PrepCase_pre": PrepCase_pre,
"end_PrepCase": end_PrepCase,
2018-09-26 00:03:43 +03:00
"begin_PronType": begin_PronType,
"PronType_advPart": PronType_advPart,
"PronType_art": PronType_art,
"PronType_default": PronType_default,
"PronType_dem": PronType_dem,
"PronType_ind": PronType_ind,
"PronType_int": PronType_int,
"PronType_neg": PronType_neg,
"PronType_prs": PronType_prs,
"PronType_rcp": PronType_rcp,
"PronType_rel": PronType_rel,
"PronType_tot": PronType_tot,
"PronType_clit": PronType_clit,
"PronType_exc": PronType_exc,
"end_PronType": end_PronType,
"begin_PunctSide": begin_PunctSide,
"PunctSide_ini": PunctSide_ini,
"PunctSide_fin": PunctSide_fin,
"end_PunctSide": end_PunctSide,
"begin_PunctType": begin_PunctType,
"PunctType_peri": PunctType_peri,
"PunctType_qest": PunctType_qest,
"PunctType_excl": PunctType_excl,
"PunctType_quot": PunctType_quot,
"PunctType_brck": PunctType_brck,
"PunctType_comm": PunctType_comm,
"PunctType_colo": PunctType_colo,
"PunctType_semi": PunctType_semi,
"PunctType_dash": PunctType_dash,
"end_PunctType": end_PunctType,
2018-09-26 22:03:57 +03:00
"begin_Reflex": begin_Reflex,
"Reflex_yes": Reflex_yes,
"end_Reflex": end_Reflex,
2018-09-26 00:03:43 +03:00
"begin_Style": begin_Style,
"Style_arch": Style_arch,
"Style_rare": Style_rare,
"Style_poet": Style_poet,
"Style_norm": Style_norm,
"Style_coll": Style_coll,
"Style_vrnc": Style_vrnc,
"Style_sing": Style_sing,
"Style_expr": Style_expr,
"Style_derg": Style_derg,
"Style_vulg": Style_vulg,
"Style_yes": Style_yes,
"end_Style": end_Style,
"begin_StyleVariant": begin_StyleVariant,
"StyleVariant_styleShort": StyleVariant_styleShort,
"StyleVariant_styleBound": StyleVariant_styleBound,
"end_StyleVariant": end_StyleVariant,
2018-09-26 22:03:57 +03:00
"begin_Tense": begin_Tense,
"Tense_fut": Tense_fut,
"Tense_imp": Tense_imp,
"Tense_past": Tense_past,
"Tense_pres": Tense_pres,
"end_Tense": end_Tense,
"begin_Typo": begin_Typo,
"Typo_yes": Typo_yes,
"end_Typo": end_Typo,
"begin_VerbForm": begin_VerbForm,
"VerbForm_fin": VerbForm_fin,
"VerbForm_ger": VerbForm_ger,
"VerbForm_inf": VerbForm_inf,
"VerbForm_none": VerbForm_none,
"VerbForm_part": VerbForm_part,
"VerbForm_partFut": VerbForm_partFut,
"VerbForm_partPast": VerbForm_partPast,
"VerbForm_partPres": VerbForm_partPres,
"VerbForm_sup": VerbForm_sup,
"VerbForm_trans": VerbForm_trans,
"VerbForm_conv": VerbForm_conv,
"VerbForm_gdv": VerbForm_gdv,
"end_VerbForm": end_VerbForm,
2018-09-26 00:03:43 +03:00
"begin_VerbType": begin_VerbType,
"VerbType_aux": VerbType_aux,
"VerbType_cop": VerbType_cop,
"VerbType_mod": VerbType_mod,
"VerbType_light": VerbType_light,
"end_VerbType": end_VerbType,
2018-09-26 22:03:57 +03:00
"begin_Voice": begin_Voice,
"Voice_act": Voice_act,
"Voice_cau": Voice_cau,
"Voice_pass": Voice_pass,
"Voice_mid": Voice_mid,
"Voice_int": Voice_int,
"end_Voice": end_Voice,
2018-09-26 00:03:43 +03:00
}
FIELD_SIZES = [get_field_size(field) for field in FIELDS]
NAMES = {value: key for key, value in IDS.items()}
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python
locals().update(IDS)