spaCy/spacy/morphology.pyx

1086 lines
30 KiB
Cython
Raw Normal View History

# cython: infer_types
# coding: utf8
from __future__ import unicode_literals
from libc.string cimport memset
2019-03-07 03:17:19 +03:00
import srsly
2019-03-07 19:14:57 +03:00
from collections import Counter
2019-03-07 23:58:43 +03:00
from .compat import basestring_
2019-03-07 03:17:19 +03:00
from .strings import get_string_id
2018-09-25 16:18:21 +03:00
from . import symbols
from .attrs cimport POS, IS_SPACE
2017-10-27 22:07:59 +03:00
from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
2016-01-19 05:36:51 +03:00
from .lexeme cimport Lexeme
from .errors import Errors
2015-08-28 00:11:51 +03:00
2019-03-07 20:32:36 +03:00
2019-03-07 19:14:57 +03:00
# Integer identifiers for the morphological fields of a MorphAnalysisC.
# The FIELDS and LOWER_FIELDS tables map field names to these members, and
# get_field() / set_feature() dispatch on them.
# NOTE: members are numbered implicitly in declaration order, so reordering or
# inserting members changes their integer values.
cdef enum univ_field_t:
    Field_Abbr
    Field_AdpType
    Field_AdvType
    Field_Animacy
    Field_Aspect
    Field_Case
    Field_ConjType
    Field_Connegative
    Field_Definite
    Field_Degree
    Field_Derivation
    Field_Echo
    Field_Foreign
    Field_Gender
    Field_Hyph
    Field_InfForm
    Field_Mood
    Field_NameType
    Field_Negative
    Field_NounType
    Field_Number
    Field_NumForm
    Field_NumType
    Field_NumValue
    Field_PartForm
    Field_PartType
    Field_Person
    Field_Polite
    Field_Polarity
    Field_Poss
    Field_Prefix
    Field_PrepCase
    Field_PronType
    Field_PunctSide
    Field_PunctType
    Field_Reflex
    Field_Style
    Field_StyleVariant
    Field_Tense
    Field_Typo
    Field_VerbForm
    Field_Voice
    Field_VerbType
2015-08-28 00:11:51 +03:00
2018-09-25 00:57:41 +03:00
def _normalize_props(props):
    """Transform deprecated string keys to correct names.

    props (dict): Attributes of a tag-map entry or special case.
    RETURNS (dict): A new dict; the input mapping is not modified.

    Two passes are made over a copy of `props`:
    1. Every `Field: value` pair whose combined name `Field_value` (value
       lower-cased) is listed in FEATURES is replaced by `Field_value: True`.
    2. POS entries (keyed by the POS attribute ID or by a string spelling
       of 'pos') are stored under the POS ID, 'morph' keys are dropped, and
       everything else is copied unchanged.
    """
    pending = dict(props)
    for field_name in FIELDS:
        if field_name not in pending:
            continue
        flag = '%s_%s' % (field_name, str(pending[field_name]).lower())
        if flag in FEATURES:
            del pending[field_name]
            pending[flag] = True
    normalized = {}
    for key, value in pending.items():
        if key == POS:
            # Accept either a POS name (any casing) or an already-numeric ID.
            if hasattr(value, 'upper'):
                value = value.upper()
            if value in POS_IDS:
                value = POS_IDS[value]
            normalized[key] = value
        elif isinstance(key, int) or value is True:
            # Integer attribute IDs and boolean feature flags pass through.
            normalized[key] = value
        elif key.lower() == 'pos':
            normalized[POS] = POS_IDS[value.upper()]
        elif key.lower() != 'morph':
            normalized[key] = value
    return normalized
def parse_feature(feature):
    """Return the `(field, offset)` pair registered for `feature`.

    The field comes from FEATURE_FIELDS and the offset from FEATURE_OFFSETS;
    a KeyError is raised if `feature` is missing from either table.
    """
    return FEATURE_FIELDS[feature], FEATURE_OFFSETS[feature]
cdef int attribute_to_field(unicode attribute_name):
    # Look up the field ID for a lower-case attribute name such as
    # 'verb_form' in the LOWER_FIELDS table.
    cdef int field_id = LOWER_FIELDS[attribute_name]
    return field_id
2019-03-07 19:14:57 +03:00
def get_field_id(feature):
    """Return the field ID that `feature` belongs to.

    `feature` is looked up in FEATURE_FIELDS, which is keyed both by feature
    name and by the feature's string ID. Raises KeyError if it is unknown.
    """
    field_id = FEATURE_FIELDS[feature]
    return field_id
2018-09-26 00:03:43 +03:00
def get_field_size(field):
    """Return the FIELD_SIZES entry for the field named `field`.

    `field` is a key of FIELDS (e.g. 'Case'); the size counts the field's
    features plus one extra slot (see where FIELD_SIZES is built).
    """
    field_id = FIELDS[field]
    return FIELD_SIZES[field_id]
2019-03-07 19:14:57 +03:00
def get_field_offset(field):
    """Return the FIELD_OFFSETS entry for the field named `field`.

    `field` is a key of FIELDS (e.g. 'Case'). Raises KeyError if the name is
    unknown or no offset was recorded for the field.
    """
    field_id = FIELDS[field]
    return FIELD_OFFSETS[field_id]
2018-09-26 00:03:43 +03:00
cdef class Morphology:
    '''Store the possible morphological analyses for a language, and index them
    by hash.

    To save space on each token, tokens only know the hash of their morphological
    analysis, so queries of morphological attributes are delegated
    to this class.
    '''
    def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
        """Build the tag tables and register any special cases.

        string_store (StringStore): Shared string table.
        tag_map (dict): Maps tag names to attribute dicts.
        lemmatizer: Callable used by `lemmatize`; may be None.
        exc (dict): Optional special cases keyed by `(tag, orth)`.
        """
        self.mem = Pool()
        self.strings = string_store
        self.tags = PreshMap()
        # Add special space symbol. We prefix with underscore, to make sure it
        # always sorts to the end.
        space_attrs = tag_map.get('SP', {POS: SPACE})
        if '_SP' not in tag_map:
            self.strings.add('_SP')
            # Copy before inserting so the caller's mapping is left untouched.
            tag_map = dict(tag_map)
            tag_map['_SP'] = space_attrs
        self.tag_names = tuple(sorted(tag_map.keys()))
        self.tag_map = {}
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map)
        self.reverse_index = {}
        # Tag IDs are the positions of the tag names in sorted order, which is
        # the same order used for `self.tag_names` above.
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            attrs = _normalize_props(attrs)
            self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
            self.tag_map[tag_str] = dict(attrs)
            self.reverse_index[self.strings.add(tag_str)] = i

        self._cache = PreshMapArray(self.n_tags)
        self.exc = {}
        if exc is not None:
            for (tag, orth), attrs in exc.items():
                attrs = _normalize_props(attrs)
                self.add_special_case(
                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)

    def __reduce__(self):
        # Pickle by re-running the constructor with the same arguments.
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
                self.exc), None, None)

    def add(self, features):
        """Insert a morphological analysis in the morphology table, if not already
        present. Returns the hash of the new analysis.

        features (iterable): Feature names or feature string IDs.
        RAISES (KeyError): If a non-zero feature ID is not in FEATURE_NAMES.
        """
        cdef attr_t feature
        cdef MorphAnalysisC tag
        cdef hash_t key
        # Intern string features first so the error message below can resolve
        # an unknown ID back to its text.
        for f in features:
            if isinstance(f, basestring_):
                self.strings.add(f)
        features = intify_features(features)
        for feature in features:
            if feature != 0 and feature not in FEATURE_NAMES:
                raise KeyError("Unknown feature: %s" % self.strings[feature])
        tag = create_rich_tag(features)
        key = self.insert(tag)
        return key

    def get(self, hash_t morph):
        """Return the feature names stored under hash `morph`, or an empty
        list if no analysis is stored under that hash.
        """
        tag = <MorphAnalysisC*>self.tags.get(morph)
        if tag == NULL:
            return []
        else:
            return tag_to_json(tag)

    cpdef update(self, hash_t morph, features):
        """Update a morphological analysis with new feature values.

        morph (hash_t): Hash of the analysis to start from.
        features (iterable): Feature names or IDs to switch on.
        RETURNS: Hash of the resulting analysis, which is inserted into the
            table if it is not already present.

        If `morph` is not in the table, the update starts from an empty
        analysis instead of dereferencing a NULL pointer.
        """
        cdef MorphAnalysisC tag
        cdef attr_t feature
        cdef MorphAnalysisC* existing = <MorphAnalysisC*>self.tags.get(morph)
        if existing == NULL:
            memset(&tag, 0, sizeof(tag))
        else:
            # Work on a copy; stored analyses are never modified in place.
            tag = existing[0]
        features = intify_features(features)
        for feature in features:
            field = get_field_id(feature)
            set_feature(&tag, field, feature, 1)
        morph = self.insert(tag)
        return morph

    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
        """Return the string ID of the lemma for the word `orth`.

        univ_pos (univ_pos_t): Coarse part-of-speech passed to the lemmatizer.
        orth (attr_t): String ID of the word form.
        morphology (dict): Features; `True` values mark `Field_value` flags.

        `orth` itself is returned when it is not in the string store, and the
        lower-cased form is used when no lemmatizer is set.
        """
        if orth not in self.strings:
            return orth
        cdef unicode py_string = self.strings[orth]
        if self.lemmatizer is None:
            return self.strings.add(py_string.lower())
        cdef list lemma_strings
        cdef unicode lemma_string
        # Normalize features into a dict keyed by the field, to make life easier
        # for the lemmatizer. Handles string-to-int conversion too.
        string_feats = {}
        for key, value in morphology.items():
            if value is True:
                name, value = self.strings.as_string(key).split('_', 1)
                string_feats[name] = value
            else:
                string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
        lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
        # Only the first candidate returned by the lemmatizer is used.
        lemma_string = lemma_strings[0]
        lemma = self.strings.add(lemma_string)
        return lemma

    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                         force=False):
        """Add a special-case rule to the morphological analyser. Tokens whose
        tag and orth match the rule will receive the specified properties.

        tag (unicode): The part-of-speech tag to key the exception.
        orth (unicode): The word-form to key the exception.
        """
        attrs = dict(attrs)
        attrs = _normalize_props(attrs)
        self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
        attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
        self.exc[(tag_str, self.strings.add(orth_str))] = attrs

    cdef hash_t insert(self, MorphAnalysisC tag) except 0:
        # Store a copy of `tag` under its hash unless an entry already exists,
        # and return the hash either way.
        cdef hash_t key = hash_tag(tag)
        if self.tags.get(key) == NULL:
            tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
            tag_ptr[0] = tag
            self.tags.set(key, <void*>tag_ptr)
        return key

    cdef int assign_untagged(self, TokenC* token) except -1:
        """Set morphological attributes on a token without a POS tag. Uses
        the lemmatizer's lookup() method, which looks up the string in the
        table provided by the language data as lemma_lookup (if available).
        """
        if token.lemma == 0:
            orth_str = self.strings[token.lex.orth]
            lemma = self.lemmatizer.lookup(orth_str)
            token.lemma = self.strings.add(lemma)

    cdef int assign_tag(self, TokenC* token, tag_str) except -1:
        # Tags present in the tag map get the full treatment (POS, lemma,
        # morphology); unknown tags are only recorded on the token.
        cdef attr_t tag = self.strings.as_int(tag_str)
        if tag in self.reverse_index:
            tag_id = self.reverse_index[tag]
            self.assign_tag_id(token, tag_id)
        else:
            token.tag = tag

    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
        cdef attr_t lemma
        # Valid IDs are 0 .. n_tags - 1: both `self.tag_names` and
        # `self._cache` hold exactly `n_tags` entries.
        if tag_id < 0 or tag_id >= self.n_tags:
            raise ValueError(Errors.E014.format(tag=tag_id))
        # Ensure spaces get tagged as space.
        # It seems pretty arbitrary to put this logic here, but there's really
        # nowhere better. I guess the justification is that this is where the
        # specific word and the tag interact. Still, we should have a better
        # way to enforce this rule, or figure out why the statistical model fails.
        # Related to Issue #220
        if Lexeme.c_check_flag(token.lex, IS_SPACE):
            tag_id = self.reverse_index[self.strings.add('_SP')]
        tag_str = self.tag_names[tag_id]
        features = dict(self.tag_map.get(tag_str, {}))
        if features:
            pos = self.strings.as_int(features.pop(POS))
        else:
            pos = 0
        # Lemmas are cached per (tag, word form); 0 means "not cached yet".
        lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
        if lemma == 0:
            # Ugh, self.lemmatize has opposite arg order from self.lemmatizer :(
            lemma = self.lemmatize(pos, token.lex.orth, features)
            self._cache.set(tag_id, token.lex.orth, <void*>lemma)
        token.lemma = lemma
        token.pos = <univ_pos_t>pos
        token.tag = self.strings[tag_str]
        token.morph = self.add(features)
        if (self.tag_names[tag_id], token.lex.orth) in self.exc:
            self._assign_tag_from_exceptions(token, tag_id)

    cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
        # Override POS and lemma with the special-case values, keeping the
        # token's current value for any attribute the rule does not set.
        key = (self.tag_names[tag_id], token.lex.orth)
        cdef dict attrs
        attrs = self.exc[key]
        token.pos = attrs.get(POS, token.pos)
        token.lemma = attrs.get(LEMMA, token.lemma)

    def load_morph_exceptions(self, dict exc):
        """Register special cases from a nested `{tag: {form: attrs}}` dict."""
        # Map (form, pos) to attributes
        for tag_str, entries in exc.items():
            for form_str, attrs in entries.items():
                self.add_special_case(tag_str, form_str, attrs)

    def to_bytes(self):
        """Serialize every stored analysis as a JSON list of feature-name lists."""
        json_tags = []
        for key in self.tags:
            tag_ptr = <MorphAnalysisC*>self.tags.get(key)
            if tag_ptr != NULL:
                json_tags.append(tag_to_json(tag_ptr))
        return srsly.json_dumps(json_tags)

    def from_bytes(self, byte_string):
        raise NotImplementedError

    def to_disk(self, path):
        raise NotImplementedError

    def from_disk(self, path):
        raise NotImplementedError
cpdef univ_pos_t get_int_tag(pos_):
    # Placeholder: the argument is ignored and the zero POS value is always
    # returned.
    cdef univ_pos_t no_pos = <univ_pos_t>0
    return no_pos
2018-09-25 21:53:24 +03:00
cpdef intify_features(features):
    """Return the set of string IDs for `features`.

    Each item is passed through `get_string_id`; duplicates collapse because
    the result is a set.
    """
    feature_ids = set()
    for feature in features:
        feature_ids.add(get_string_id(feature))
    return feature_ids
2018-09-25 00:57:41 +03:00
2019-03-07 16:03:07 +03:00
cdef hash_t hash_tag(MorphAnalysisC tag) nogil:
    # Hash the raw bytes of the struct (seed 0), so two analyses share a key
    # exactly when their in-memory representations are identical.
    cdef hash_t digest = mrmr.hash64(&tag, sizeof(MorphAnalysisC), 0)
    return digest
2019-03-07 19:14:57 +03:00
def get_feature_field(feature):
    """Return the field ID that `feature` belongs to.

    `feature` may be a feature name or a feature string ID; FEATURE_FIELDS is
    keyed by both. Raises KeyError if the feature is unknown.
    """
    # The previous version also computed `get_string_id(feature)` into a
    # local that was never read; the lookup has always used `feature` itself.
    return FEATURE_FIELDS[feature]
2019-03-07 16:03:07 +03:00
cdef MorphAnalysisC create_rich_tag(features) except *:
    # Build a fresh analysis struct with each of `features` switched on.
    # `features` is an iterable of feature string IDs; unknown IDs raise
    # KeyError from get_field_id().
    cdef MorphAnalysisC analysis
    cdef attr_t feature
    # Start from all-zero bytes so unset fields (and any padding) are
    # deterministic.
    memset(&analysis, 0, sizeof(MorphAnalysisC))
    for feature in features:
        set_feature(&analysis, get_field_id(feature), feature, 1)
    return analysis
2019-03-07 19:14:57 +03:00
2019-03-08 02:08:35 +03:00
cdef tag_to_json(const MorphAnalysisC* tag):
    # Convert an analysis into a JSON-friendly list of feature names, in the
    # order produced by list_features().
    names = []
    for feature_id in list_features(tag):
        names.append(FEATURE_NAMES[feature_id])
    return names
# Inverse of tag_to_json(); not implemented yet, so it always raises.
cdef MorphAnalysisC tag_from_json(json_tag):
    raise NotImplementedError
cdef list list_features(const MorphAnalysisC* tag):
    # Return the non-zero feature IDs stored in `tag`, one per populated field.
    #
    # Every field handled by get_field()/set_feature() is listed here. The
    # previous version skipped num_form, num_type, num_value, punct_side and
    # tense, so features in those fields were silently dropped from the output
    # of Morphology.get() and Morphology.to_bytes(). The relative order of the
    # fields that were already listed is unchanged.
    field_values = (
        tag.abbr,
        tag.adp_type,
        tag.adv_type,
        tag.animacy,
        tag.aspect,
        tag.case,
        tag.conj_type,
        tag.connegative,
        tag.definite,
        tag.degree,
        tag.derivation,
        tag.echo,
        tag.foreign,
        tag.gender,
        tag.hyph,
        tag.inf_form,
        tag.mood,
        tag.negative,
        tag.number,
        tag.name_type,
        tag.noun_type,
        tag.num_form,
        tag.num_type,
        tag.num_value,
        tag.part_form,
        tag.part_type,
        tag.person,
        tag.polite,
        tag.polarity,
        tag.poss,
        tag.prefix,
        tag.prep_case,
        tag.pron_type,
        tag.punct_side,
        tag.punct_type,
        tag.reflex,
        tag.style,
        tag.style_variant,
        tag.tense,
        tag.typo,
        tag.verb_form,
        tag.voice,
        tag.verb_type,
    )
    # A zero value means the field is unset.
    return [value for value in field_values if value != 0]
2018-09-25 00:57:41 +03:00
2019-03-07 20:32:36 +03:00
2019-03-08 02:08:35 +03:00
cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil:
    # Return the feature ID stored in the field `field_id` of `tag`
    # (0 when the field is unset).
    #
    # `field_id` is a univ_field_t value. An unrecognised value raises
    # ValueError; the previous error branch formatted a name `feature` that
    # does not exist in this function.
    field = <univ_field_t>field_id
    if field == Field_Abbr:
        return tag.abbr
    elif field == Field_AdpType:
        return tag.adp_type
    elif field == Field_AdvType:
        return tag.adv_type
    elif field == Field_Animacy:
        return tag.animacy
    elif field == Field_Aspect:
        return tag.aspect
    elif field == Field_Case:
        return tag.case
    elif field == Field_ConjType:
        return tag.conj_type
    elif field == Field_Connegative:
        return tag.connegative
    elif field == Field_Definite:
        return tag.definite
    elif field == Field_Degree:
        return tag.degree
    elif field == Field_Derivation:
        return tag.derivation
    elif field == Field_Echo:
        return tag.echo
    elif field == Field_Foreign:
        return tag.foreign
    elif field == Field_Gender:
        return tag.gender
    elif field == Field_Hyph:
        return tag.hyph
    elif field == Field_InfForm:
        return tag.inf_form
    elif field == Field_Mood:
        return tag.mood
    elif field == Field_Negative:
        return tag.negative
    elif field == Field_Number:
        return tag.number
    elif field == Field_NameType:
        return tag.name_type
    elif field == Field_NounType:
        return tag.noun_type
    elif field == Field_NumForm:
        return tag.num_form
    elif field == Field_NumType:
        return tag.num_type
    elif field == Field_NumValue:
        return tag.num_value
    elif field == Field_PartForm:
        return tag.part_form
    elif field == Field_PartType:
        return tag.part_type
    elif field == Field_Person:
        return tag.person
    elif field == Field_Polite:
        return tag.polite
    elif field == Field_Polarity:
        return tag.polarity
    elif field == Field_Poss:
        return tag.poss
    elif field == Field_Prefix:
        return tag.prefix
    elif field == Field_PrepCase:
        return tag.prep_case
    elif field == Field_PronType:
        return tag.pron_type
    elif field == Field_PunctSide:
        return tag.punct_side
    elif field == Field_PunctType:
        return tag.punct_type
    elif field == Field_Reflex:
        return tag.reflex
    elif field == Field_Style:
        return tag.style
    elif field == Field_StyleVariant:
        return tag.style_variant
    elif field == Field_Tense:
        return tag.tense
    elif field == Field_Typo:
        return tag.typo
    elif field == Field_VerbForm:
        return tag.verb_form
    elif field == Field_Voice:
        return tag.voice
    elif field == Field_VerbType:
        return tag.verb_type
    else:
        # Building and raising the exception needs the GIL, which a nogil
        # function is not guaranteed to hold.
        # NOTE(review): the signature declares no exception return value, so
        # Cython presumably cannot propagate this to the caller -- confirm
        # whether an `except` clause should be added in the .pxd.
        with gil:
            raise ValueError("Unknown field: (%d)" % field_id)
2019-03-07 20:32:36 +03:00
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil:
    # Return 1 if any field of `tag` holds the value `feature`, else 0.
    # Every field is compared directly, without looking up which field the
    # feature belongs to.
    if (tag.abbr == feature
            or tag.adp_type == feature
            or tag.adv_type == feature
            or tag.animacy == feature
            or tag.aspect == feature
            or tag.case == feature
            or tag.conj_type == feature
            or tag.connegative == feature
            or tag.definite == feature
            or tag.degree == feature
            or tag.derivation == feature
            or tag.echo == feature
            or tag.foreign == feature
            or tag.gender == feature
            or tag.hyph == feature
            or tag.inf_form == feature
            or tag.mood == feature
            or tag.negative == feature
            or tag.number == feature
            or tag.name_type == feature
            or tag.noun_type == feature
            or tag.num_form == feature
            or tag.num_type == feature
            or tag.num_value == feature
            or tag.part_form == feature
            or tag.part_type == feature
            or tag.person == feature
            or tag.polite == feature
            or tag.polarity == feature
            or tag.poss == feature
            or tag.prefix == feature
            or tag.prep_case == feature
            or tag.pron_type == feature
            or tag.punct_side == feature
            or tag.punct_type == feature
            or tag.reflex == feature
            or tag.style == feature
            or tag.style_variant == feature
            or tag.tense == feature
            or tag.typo == feature
            or tag.verb_form == feature
            or tag.voice == feature
            or tag.verb_type == feature):
        return 1
    return 0
2018-09-25 00:57:41 +03:00
2019-03-07 19:14:57 +03:00
cdef int set_feature(MorphAnalysisC* tag,
        univ_field_t field, attr_t feature, int value) except -1:
    # Switch `feature` on (value == 1) or off (any other value) in the given
    # `field` of `tag`, keeping `tag.length` equal to the number of populated
    # fields.
    #
    # Raises ValueError when `field` is not a known univ_field_t member.
    cdef attr_t value_
    cdef attr_t prev_value
    if feature == 0:
        # Feature 0 never writes to any field. Return before the length
        # bookkeeping; the previous version adjusted `tag.length` first and
        # then skipped the write, leaving the counter out of sync.
        return 0
    if value == 1:
        value_ = feature
    else:
        value_ = 0
    prev_value = get_field(tag, field)
    if prev_value != 0 and value_ == 0:
        tag.length -= 1
    elif prev_value == 0 and value_ != 0:
        tag.length += 1
    if field == Field_Abbr:
        tag.abbr = value_
    elif field == Field_AdpType:
        tag.adp_type = value_
    elif field == Field_AdvType:
        tag.adv_type = value_
    elif field == Field_Animacy:
        tag.animacy = value_
    elif field == Field_Aspect:
        tag.aspect = value_
    elif field == Field_Case:
        tag.case = value_
    elif field == Field_ConjType:
        tag.conj_type = value_
    elif field == Field_Connegative:
        tag.connegative = value_
    elif field == Field_Definite:
        tag.definite = value_
    elif field == Field_Degree:
        tag.degree = value_
    elif field == Field_Derivation:
        tag.derivation = value_
    elif field == Field_Echo:
        tag.echo = value_
    elif field == Field_Foreign:
        tag.foreign = value_
    elif field == Field_Gender:
        tag.gender = value_
    elif field == Field_Hyph:
        tag.hyph = value_
    elif field == Field_InfForm:
        tag.inf_form = value_
    elif field == Field_Mood:
        tag.mood = value_
    elif field == Field_Negative:
        tag.negative = value_
    elif field == Field_Number:
        tag.number = value_
    elif field == Field_NameType:
        tag.name_type = value_
    elif field == Field_NounType:
        tag.noun_type = value_
    elif field == Field_NumForm:
        tag.num_form = value_
    elif field == Field_NumType:
        tag.num_type = value_
    elif field == Field_NumValue:
        tag.num_value = value_
    elif field == Field_PartForm:
        tag.part_form = value_
    elif field == Field_PartType:
        tag.part_type = value_
    elif field == Field_Person:
        tag.person = value_
    elif field == Field_Polite:
        tag.polite = value_
    elif field == Field_Polarity:
        tag.polarity = value_
    elif field == Field_Poss:
        tag.poss = value_
    elif field == Field_Prefix:
        tag.prefix = value_
    elif field == Field_PrepCase:
        tag.prep_case = value_
    elif field == Field_PronType:
        tag.pron_type = value_
    elif field == Field_PunctSide:
        tag.punct_side = value_
    elif field == Field_PunctType:
        tag.punct_type = value_
    elif field == Field_Reflex:
        tag.reflex = value_
    elif field == Field_Style:
        tag.style = value_
    elif field == Field_StyleVariant:
        tag.style_variant = value_
    elif field == Field_Tense:
        tag.tense = value_
    elif field == Field_Typo:
        tag.typo = value_
    elif field == Field_VerbForm:
        tag.verb_form = value_
    elif field == Field_Voice:
        tag.voice = value_
    elif field == Field_VerbType:
        tag.verb_type = value_
    else:
        raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature))
    return 0
2016-12-18 17:48:00 +03:00
# Field names as they appear in feature strings ('Case' in 'Case_nom'),
# mapped to their univ_field_t IDs.
FIELDS = {
    'Abbr': Field_Abbr,
    'AdpType': Field_AdpType,
    'AdvType': Field_AdvType,
    'Animacy': Field_Animacy,
    'Aspect': Field_Aspect,
    'Case': Field_Case,
    'ConjType': Field_ConjType,
    'Connegative': Field_Connegative,
    'Definite': Field_Definite,
    'Degree': Field_Degree,
    'Derivation': Field_Derivation,
    'Echo': Field_Echo,
    'Foreign': Field_Foreign,
    'Gender': Field_Gender,
    'Hyph': Field_Hyph,
    'InfForm': Field_InfForm,
    'Mood': Field_Mood,
    'NameType': Field_NameType,
    'Negative': Field_Negative,
    'NounType': Field_NounType,
    'Number': Field_Number,
    'NumForm': Field_NumForm,
    'NumType': Field_NumType,
    'NumValue': Field_NumValue,
    'PartForm': Field_PartForm,
    'PartType': Field_PartType,
    'Person': Field_Person,
    'Polite': Field_Polite,
    'Polarity': Field_Polarity,
    'Poss': Field_Poss,
    'Prefix': Field_Prefix,
    'PrepCase': Field_PrepCase,
    'PronType': Field_PronType,
    'PunctSide': Field_PunctSide,
    'PunctType': Field_PunctType,
    'Reflex': Field_Reflex,
    'Style': Field_Style,
    'StyleVariant': Field_StyleVariant,
    'Tense': Field_Tense,
    'Typo': Field_Typo,
    'VerbForm': Field_VerbForm,
    'Voice': Field_Voice,
    'VerbType': Field_VerbType,
}
# Lower-case, underscore-separated spellings of the field names, mapped to
# the same univ_field_t IDs as FIELDS. Used by attribute_to_field().
LOWER_FIELDS = {
    'abbr': Field_Abbr,
    'adp_type': Field_AdpType,
    'adv_type': Field_AdvType,
    'animacy': Field_Animacy,
    'aspect': Field_Aspect,
    'case': Field_Case,
    'conj_type': Field_ConjType,
    'connegative': Field_Connegative,
    'definite': Field_Definite,
    'degree': Field_Degree,
    'derivation': Field_Derivation,
    'echo': Field_Echo,
    'foreign': Field_Foreign,
    'gender': Field_Gender,
    'hyph': Field_Hyph,
    'inf_form': Field_InfForm,
    'mood': Field_Mood,
    'name_type': Field_NameType,
    'negative': Field_Negative,
    'noun_type': Field_NounType,
    'number': Field_Number,
    'num_form': Field_NumForm,
    'num_type': Field_NumType,
    'num_value': Field_NumValue,
    'part_form': Field_PartForm,
    'part_type': Field_PartType,
    'person': Field_Person,
    'polite': Field_Polite,
    'polarity': Field_Polarity,
    'poss': Field_Poss,
    'prefix': Field_Prefix,
    'prep_case': Field_PrepCase,
    'pron_type': Field_PronType,
    'punct_side': Field_PunctSide,
    'punct_type': Field_PunctType,
    'reflex': Field_Reflex,
    'style': Field_Style,
    'style_variant': Field_StyleVariant,
    'tense': Field_Tense,
    'typo': Field_Typo,
    'verb_form': Field_VerbForm,
    'voice': Field_Voice,
    'verb_type': Field_VerbType
}
2019-03-07 19:14:57 +03:00
# All known morphological features, written as '<Field>_<value>' where the
# part before the first underscore is a key of FIELDS. The order matters:
# FEATURE_OFFSETS and FIELD_OFFSETS are derived from positions in this list.
FEATURES = [
    "Abbr_yes",
    "AdpType_circ",
    "AdpType_comprep",
    "AdpType_prep",
    "AdpType_post",
    "AdpType_voc",
    # The comma used to sit inside the quotes, which fused this entry with
    # the next one through implicit string concatenation.
    "AdvType_adadj",
    "AdvType_cau",
    "AdvType_deg",
    "AdvType_ex",
    "AdvType_loc",
    "AdvType_man",
    "AdvType_mod",
    "AdvType_sta",
    "AdvType_tim",
    "Animacy_anim",
    "Animacy_hum",
    "Animacy_inan",
    "Animacy_nhum",
    "Aspect_freq",
    "Aspect_imp",
    "Aspect_mod",
    "Aspect_none",
    "Aspect_perf",
    "Aspect_prof",
    "Aspect_prosp",
    "Case_abe",
    "Case_abl",
    "Case_abs",
    "Case_acc",
    "Case_ade",
    "Case_all",
    "Case_cau",
    "Case_com",
    "Case_dat",
    "Case_del",
    "Case_dis",
    "Case_ela",
    "Case_ess",
    "Case_gen",
    "Case_ill",
    "Case_ine",
    "Case_ins",
    "Case_loc",
    "Case_lat",
    "Case_nom",
    "Case_par",
    "Case_sub",
    "Case_sup",
    "Case_tem",
    "Case_ter",
    "Case_tra",
    "Case_voc",
    "ConjType_comp",
    "ConjType_oper",
    "Connegative_yes",
    "Definite_cons",
    "Definite_def",
    "Definite_ind",
    "Definite_red",
    "Definite_two",
    "Degree_abs",
    "Degree_cmp",
    "Degree_comp",
    "Degree_none",
    "Degree_pos",
    "Degree_sup",
    "Degree_com",
    "Degree_dim",
    "Derivation_minen",
    "Derivation_sti",
    "Derivation_inen",
    "Derivation_lainen",
    "Derivation_ja",
    "Derivation_ton",
    "Derivation_vs",
    "Derivation_ttain",
    "Derivation_ttaa",
    "Echo_rdp",
    "Echo_ech",
    "Foreign_foreign",
    "Foreign_fscript",
    "Foreign_tscript",
    "Foreign_yes",
    "Gender_com",
    "Gender_fem",
    "Gender_masc",
    "Gender_neut",
    "Gender_dat_masc",
    "Gender_dat_fem",
    "Gender_erg_masc",
    "Gender_erg_fem",
    "Gender_psor_masc",
    "Gender_psor_fem",
    "Gender_psor_neut",
    "Hyph_yes",
    "InfForm_one",
    "InfForm_two",
    "InfForm_three",
    "Mood_cnd",
    "Mood_imp",
    "Mood_ind",
    "Mood_n",
    "Mood_pot",
    "Mood_sub",
    "Mood_opt",
    "NameType_geo",
    "NameType_prs",
    "NameType_giv",
    "NameType_sur",
    "NameType_nat",
    "NameType_com",
    "NameType_pro",
    "NameType_oth",
    "Negative_neg",
    "Negative_pos",
    "Negative_yes",
    "NounType_com",
    "NounType_prop",
    "NounType_class",
    "Number_com",
    "Number_dual",
    "Number_none",
    "Number_plur",
    "Number_sing",
    "Number_ptan",
    "Number_count",
    "Number_abs_sing",
    "Number_abs_plur",
    "Number_dat_sing",
    "Number_dat_plur",
    "Number_erg_sing",
    "Number_erg_plur",
    "Number_psee_sing",
    "Number_psee_plur",
    "Number_psor_sing",
    "Number_psor_plur",
    "NumForm_digit",
    "NumForm_roman",
    "NumForm_word",
    "NumType_card",
    "NumType_dist",
    "NumType_frac",
    "NumType_gen",
    "NumType_mult",
    "NumType_none",
    "NumType_ord",
    "NumType_sets",
    "NumValue_one",
    "NumValue_two",
    "NumValue_three",
    "PartForm_pres",
    "PartForm_past",
    "PartForm_agt",
    "PartForm_neg",
    "PartType_mod",
    "PartType_emp",
    "PartType_res",
    "PartType_inf",
    "PartType_vbp",
    "Person_one",
    "Person_two",
    "Person_three",
    "Person_none",
    "Person_abs_one",
    "Person_abs_two",
    "Person_abs_three",
    "Person_dat_one",
    "Person_dat_two",
    "Person_dat_three",
    "Person_erg_one",
    "Person_erg_two",
    "Person_erg_three",
    "Person_psor_one",
    "Person_psor_two",
    "Person_psor_three",
    "Polarity_neg",
    "Polarity_pos",
    "Polite_inf",
    "Polite_pol",
    "Polite_abs_inf",
    "Polite_abs_pol",
    "Polite_erg_inf",
    "Polite_erg_pol",
    "Polite_dat_inf",
    "Polite_dat_pol",
    "Poss_yes",
    "Prefix_yes",
    "PrepCase_npr",
    "PrepCase_pre",
    "PronType_advPart",
    "PronType_art",
    "PronType_default",
    "PronType_dem",
    "PronType_ind",
    "PronType_int",
    "PronType_neg",
    "PronType_prs",
    "PronType_rcp",
    "PronType_rel",
    "PronType_tot",
    "PronType_clit",
    "PronType_exc",
    "PunctSide_ini",
    "PunctSide_fin",
    "PunctType_peri",
    "PunctType_qest",
    "PunctType_excl",
    "PunctType_quot",
    "PunctType_brck",
    "PunctType_comm",
    "PunctType_colo",
    "PunctType_semi",
    "PunctType_dash",
    "Reflex_yes",
    "Style_arch",
    "Style_rare",
    "Style_poet",
    "Style_norm",
    "Style_coll",
    "Style_vrnc",
    "Style_sing",
    "Style_expr",
    "Style_derg",
    "Style_vulg",
    "Style_yes",
    "StyleVariant_styleShort",
    "StyleVariant_styleBound",
    "Tense_fut",
    "Tense_imp",
    "Tense_past",
    "Tense_pres",
    "Typo_yes",
    "VerbForm_fin",
    "VerbForm_ger",
    "VerbForm_inf",
    "VerbForm_none",
    "VerbForm_part",
    "VerbForm_partFut",
    "VerbForm_partPast",
    "VerbForm_partPres",
    "VerbForm_sup",
    "VerbForm_trans",
    "VerbForm_conv",
    "VerbForm_gdv",
    "VerbType_aux",
    "VerbType_cop",
    "VerbType_mod",
    "VerbType_light",
    "Voice_act",
    "Voice_cau",
    "Voice_pass",
    "Voice_mid",
    "Voice_int",
]
# Feature string ID -> feature name.
FEATURE_NAMES = {get_string_id(name): name for name in FEATURES}
# Feature name -> field ID, taken from the part before the first underscore.
FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES}
# Field ID -> number of features in the field, plus one extra slot per field.
# This must be computed before the ID keys are added to FEATURE_FIELDS below,
# otherwise every feature would be counted twice.
FIELD_SIZES = Counter(FEATURE_FIELDS.values())
for field in list(FIELD_SIZES):
    FIELD_SIZES[field] = FIELD_SIZES[field] + 1
# Make FEATURE_FIELDS answer lookups by feature string ID as well as by name.
for feat_id, name in FEATURE_NAMES.items():
    FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name]
# Feature name -> 1-based position of the feature within its own field
# (position 0 is left for the NIL class of each field).
FEATURE_OFFSETS = {}
# Field ID -> index in FEATURES of the first feature belonging to the field.
FIELD_OFFSETS = {}
_seen_fields = Counter()
for i, feature in enumerate(FEATURES):
    field = FEATURE_FIELDS[feature]
    if field not in FIELD_OFFSETS:
        # First feature encountered for this field.
        FIELD_OFFSETS[field] = i
    _seen_fields[field] += 1
    FEATURE_OFFSETS[feature] = _seen_fields[field]