spaCy/spacy/morphology.pyx

1108 lines
31 KiB
Cython
Raw Normal View History

# cython: infer_types
# coding: utf8
from __future__ import unicode_literals
from libc.string cimport memset
2019-03-07 03:17:19 +03:00
import srsly
2019-03-07 19:14:57 +03:00
from collections import Counter
2019-03-07 23:58:43 +03:00
from .compat import basestring_
2019-03-07 03:17:19 +03:00
from .strings import get_string_id
2018-09-25 16:18:21 +03:00
from . import symbols
from .attrs cimport POS, IS_SPACE
2017-10-27 22:07:59 +03:00
from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
2016-01-19 05:36:51 +03:00
from .lexeme cimport Lexeme
from .errors import Errors
2019-08-29 22:17:34 +03:00
from .util import ensure_path
2015-08-28 00:11:51 +03:00
2019-03-07 20:32:36 +03:00
2019-03-07 19:14:57 +03:00
# Identifiers for the morphological feature fields. The numeric values come
# from declaration order (starting at zero), and the FIELDS / LOWER_FIELDS
# tables expose them to Python, so members must not be reordered.
cdef enum univ_field_t:
    Field_POS = 0
    Field_Abbr
    Field_AdpType
    Field_AdvType
    Field_Animacy
    Field_Aspect
    Field_Case
    Field_ConjType
    Field_Connegative
    Field_Definite
    Field_Degree
    Field_Derivation
    Field_Echo
    Field_Foreign
    Field_Gender
    Field_Hyph
    Field_InfForm
    Field_Mood
    Field_NameType
    Field_Negative
    Field_NounType
    Field_Number
    Field_NumForm
    Field_NumType
    Field_NumValue
    Field_PartForm
    Field_PartType
    Field_Person
    Field_Polarity
    Field_Polite
    Field_Poss
    Field_Prefix
    Field_PrepCase
    Field_PronType
    Field_PunctSide
    Field_PunctType
    Field_Reflex
    Field_Style
    Field_StyleVariant
    Field_Tense
    Field_Typo
    Field_VerbForm
    Field_VerbType
    Field_Voice
2015-08-28 00:11:51 +03:00
2018-09-25 00:57:41 +03:00
def _normalize_props(props):
    """Transform deprecated string keys to correct names.

    props (dict-like): Attribute mapping, e.g. one entry of a tag map.
    RETURNS (dict): A new dict in which
        * `Field: value` pairs that name a known feature are replaced by
          `"Field_value": True`,
        * the POS entry (keyed by the POS attribute ID or by a string that
          lower-cases to "pos") is mapped through POS_IDS,
        * any key that lower-cases to "morph" is dropped,
        * everything else is copied through unchanged.
    """
    # Work on a copy so the caller's mapping is never mutated.
    props = dict(props)
    for field_name in FIELDS:
        if field_name not in props:
            continue
        raw_value = str(props[field_name]).lower()
        # We don't have support for disjunctive int|rel features, so
        # just take the first one :(
        if "|" in raw_value:
            raw_value = raw_value.split("|")[0]
        feature_name = '%s_%s' % (field_name, raw_value)
        if feature_name in FEATURES:
            props.pop(field_name)
            props[feature_name] = True

    normalized = {}
    for key, value in props.items():
        if key == POS:
            # Accept either a POS name (any case) or an already-resolved ID.
            if hasattr(value, 'upper'):
                value = value.upper()
            if value in POS_IDS:
                value = POS_IDS[value]
            normalized[key] = value
            continue
        if isinstance(key, int) or value is True:
            # Integer attribute IDs and boolean feature flags pass through.
            normalized[key] = value
            continue
        lowered_key = key.lower()
        if lowered_key == 'pos':
            normalized[POS] = POS_IDS[value.upper()]
        elif lowered_key != 'morph':
            normalized[key] = value
    return normalized
2019-03-09 22:55:33 +03:00
class MorphologyClassMap(object):
    """Lookup tables relating feature names, fields, columns and offsets.

    Attributes built by the constructor:
        features: tuple of the feature names, in the order given.
        fields: field names ("POS", "Abbr", ...) in order of first appearance.
        feat2field: feature name -> field ID (the FIELDS value).
        id2feat: get_string_id(feature name) -> feature name.
        field2feats: field name -> list of its features. Every field except
            "POS" starts with a "NIL" placeholder at offset 0.
        col2info: one (field, offset, feature) tuple per column.
        attr2field: copy of LOWER_FIELDS.
        feat2offset: feature name -> offset within its field.
        field2col: field name -> index of the field's first column.
        field2id / fieldid2field: copy of FIELDS and its inverse.
    """
    def __init__(self, features):
        """Build all tables from an iterable of "Field_value" feature names.

        features (iterable): Feature names; the part before the first
            underscore must be a key of FIELDS.
        """
        # Materialise once so that one-shot iterables (generators) work:
        # the names are traversed several times below.
        self.features = tuple(features)
        self.fields = []
        self.feat2field = {}
        seen_fields = set()
        for feature in self.features:
            field = feature.split("_", 1)[0]
            if field not in seen_fields:
                self.fields.append(field)
                seen_fields.add(field)
            self.feat2field[feature] = FIELDS[field]
        self.id2feat = {get_string_id(name): name for name in self.features}
        self.field2feats = {"POS": []}
        self.col2info = []
        self.attr2field = dict(LOWER_FIELDS.items())
        self.feat2offset = {}
        self.field2col = {}
        self.field2id = dict(FIELDS.items())
        self.fieldid2field = {field_id: field for field, field_id in FIELDS.items()}
        for feature in self.features:
            # Resolve the field name from the field ID. Indexing self.fields
            # by the ID (as was done previously) is only right when fields
            # first appear in exactly the enum order and none is missing.
            field = self.fieldid2field[self.feat2field[feature]]
            if field not in self.field2col:
                self.field2col[field] = len(self.col2info)
            if field != "POS" and field not in self.field2feats:
                # Non-POS fields get a leading "NIL" column for "unset".
                self.col2info.append((field, 0, "NIL"))
            self.field2feats.setdefault(field, ["NIL"])
            offset = len(self.field2feats[field])
            self.field2feats[field].append(feature)
            self.col2info.append((field, offset, feature))
            self.feat2offset[feature] = offset

    @property
    def field_sizes(self):
        """Number of entries (including any "NIL") per field, in `fields` order."""
        return [len(self.field2feats[field]) for field in self.fields]

    def get_field_offset(self, field):
        """Return the index of the first column belonging to `field`."""
        return self.field2col[field]
2018-09-26 00:03:43 +03:00
cdef class Morphology:
    '''Store the possible morphological analyses for a language, and index them
    by hash.

    To save space on each token, tokens only know the hash of their morphological
    analysis, so queries of morphological attributes are delegated
    to this class.
    '''
    def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
        """Create the morphology table.

        string_store (StringStore): Shared string table.
        tag_map (dict): Maps tag strings to attribute dicts.
        lemmatizer: Object called by `lemmatize` and whose `lookup` method is
            used by `assign_untagged`. `lemmatize` tolerates None.
        exc (dict): Optional special cases keyed by `(tag, orth)`.
        """
        self.mem = Pool()
        self.strings = string_store
        self.tags = PreshMap()
        # Add special space symbol. We prefix with underscore, to make sure it
        # always sorts to the end.
        space_attrs = tag_map.get('SP', {POS: SPACE})
        if '_SP' not in tag_map:
            self.strings.add('_SP')
            # Copy before inserting so the caller's tag map is left untouched.
            tag_map = dict(tag_map)
            tag_map['_SP'] = space_attrs
        self.tag_names = tuple(sorted(tag_map.keys()))
        self.tag_map = {}
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map)
        self.reverse_index = {}
        self._feat_map = MorphologyClassMap(FEATURES)
        self._load_from_tag_map(tag_map)

        self._cache = PreshMapArray(self.n_tags)
        self.exc = {}
        if exc is not None:
            for (tag, orth), attrs in exc.items():
                attrs = _normalize_props(attrs)
                self.add_special_case(
                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)

    def _load_from_tag_map(self, tag_map):
        """Normalize every tag map entry and index the tags.

        Tag IDs are the positions of the tags in sorted order, which matches
        `self.tag_names`.
        """
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            attrs = _normalize_props(attrs)
            # NOTE(review): id2feat is keyed by get_string_id() values, so
            # only keys of `attrs` that already are such IDs match here;
            # confirm that string feature names are meant to be skipped.
            self.add({self._feat_map.id2feat[feat] for feat in attrs
                      if feat in self._feat_map.id2feat})
            self.tag_map[tag_str] = dict(attrs)
            self.reverse_index[self.strings.add(tag_str)] = i

    def __reduce__(self):
        """Pickle support: rebuild from strings, tag map, lemmatizer and exceptions."""
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
                self.exc), None, None)

    def add(self, features):
        """Insert a morphological analysis in the morphology table, if not already
        present. Returns the hash of the new analysis.

        features (iterable): Feature names or feature IDs.
        RAISES (KeyError): If a non-zero feature is not a known feature.
        """
        cdef attr_t feature
        cdef MorphAnalysisC tag
        cdef hash_t key
        # Make sure every string feature is interned before it is hashed.
        for f in features:
            if isinstance(f, basestring_):
                self.strings.add(f)
        features = intify_features(features)
        for feature in features:
            if feature != 0 and feature not in self._feat_map.id2feat:
                raise KeyError("Unknown feature: %s" % self.strings[feature])
        tag = create_rich_tag(features)
        key = self.insert(tag)
        return key

    def get(self, hash_t morph):
        """Return the feature names of the analysis stored under `morph`.

        RETURNS (list): Empty if no analysis is stored under that hash.
        """
        tag = <MorphAnalysisC*>self.tags.get(morph)
        if tag == NULL:
            return []
        else:
            return tag_to_json(tag)

    cpdef update(self, hash_t morph, features):
        """Update a morphological analysis with new feature values.

        morph (hash_t): Hash of the analysis to start from. If nothing is
            stored under it, the update starts from an empty analysis
            (mirroring `get`, which reports such hashes as empty).
        features (iterable): Feature names or IDs to switch on.
        RETURNS: The hash of the resulting analysis.
        """
        cdef attr_t feature
        cdef MorphAnalysisC tag
        cdef MorphAnalysisC* stored = <MorphAnalysisC*>self.tags.get(morph)
        if stored == NULL:
            # Previously this dereferenced the NULL pointer.
            memset(&tag, 0, sizeof(tag))
        else:
            # Copy the struct so the stored analysis itself is not modified.
            tag = stored[0]
        features = intify_features(features)
        for feature in features:
            field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
            set_feature(&tag, field, feature, 1)
        morph = self.insert(tag)
        return morph

    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
        """Return the ID of the lemma for a word form.

        univ_pos (univ_pos_t): Coarse part-of-speech passed to the lemmatizer.
        orth (attr_t): ID of the word form.
        morphology (dict): Features, either `"Field_value": True` flags or
            plain key/value pairs.
        RETURNS: `orth` unchanged if it is not in the string store; the
            lower-cased form if there is no lemmatizer; otherwise the first
            lemma proposed by the lemmatizer.
        """
        cdef unicode py_string
        cdef list lemma_strings
        cdef unicode lemma_string
        if orth not in self.strings:
            return orth
        py_string = self.strings[orth]
        if self.lemmatizer is None:
            return self.strings.add(py_string.lower())
        # Normalize features into a dict keyed by the field, to make life easier
        # for the lemmatizer. Handles string-to-int conversion too.
        string_feats = {}
        for key, value in morphology.items():
            if value is True:
                name, value = self.strings.as_string(key).split('_', 1)
                string_feats[name] = value
            else:
                string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
        lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
        lemma_string = lemma_strings[0]
        lemma = self.strings.add(lemma_string)
        return lemma

    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                         force=False):
        """Add a special-case rule to the morphological analyser. Tokens whose
        tag and orth match the rule will receive the specified properties.

        tag (unicode): The part-of-speech tag to key the exception.
        orth (unicode): The word-form to key the exception.
        attrs (dict): Properties to assign; normalized before being stored.
        force (bool): Not used by this method.
        """
        attrs = dict(attrs)
        attrs = _normalize_props(attrs)
        self.add({self._feat_map.id2feat[feat] for feat in attrs
                  if feat in self._feat_map.id2feat})
        attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
        self.exc[(tag_str, self.strings.add(orth_str))] = attrs

    cdef hash_t insert(self, MorphAnalysisC tag) except 0:
        """Store a copy of `tag` under its hash (if new) and return the hash."""
        cdef hash_t key = hash_tag(tag)
        if self.tags.get(key) == NULL:
            tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
            tag_ptr[0] = tag
            self.tags.set(key, <void*>tag_ptr)
        return key

    cdef int assign_untagged(self, TokenC* token) except -1:
        """Set morphological attributes on a token without a POS tag. Uses
        the lemmatizer's lookup() method, which looks up the string in the
        table provided by the language data as lemma_lookup (if available).
        """
        # Only fill in the lemma when none has been set yet.
        if token.lemma == 0:
            orth_str = self.strings[token.lex.orth]
            lemma = self.lemmatizer.lookup(token.lex.orth, orth_str)
            token.lemma = self.strings.add(lemma)

    cdef int assign_tag(self, TokenC* token, tag_str) except -1:
        """Assign a tag given as string or ID.

        Known tags go through `assign_tag_id`; unknown ones are only written
        to `token.tag`.
        """
        cdef attr_t tag = self.strings.as_int(tag_str)
        if tag in self.reverse_index:
            tag_id = self.reverse_index[tag]
            self.assign_tag_id(token, tag_id)
        else:
            token.tag = tag

    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
        """Assign tag, POS, lemma and morphology for the tag at index `tag_id`.

        RAISES (ValueError): E014 if `tag_id` is past the end of the tag table.
        """
        cdef attr_t lemma
        # Valid indices are 0 .. n_tags - 1, so n_tags itself is out of range.
        if tag_id >= self.n_tags:
            raise ValueError(Errors.E014.format(tag=tag_id))
        # Ensure spaces get tagged as space.
        # It seems pretty arbitrary to put this logic here, but there's really
        # nowhere better. I guess the justification is that this is where the
        # specific word and the tag interact. Still, we should have a better
        # way to enforce this rule, or figure out why the statistical model fails.
        # Related to Issue #220
        if Lexeme.c_check_flag(token.lex, IS_SPACE):
            tag_id = self.reverse_index[self.strings.add('_SP')]
        tag_str = self.tag_names[tag_id]
        features = dict(self.tag_map.get(tag_str, {}))
        # A tag may carry features without a POS entry; treat that as "no POS"
        # instead of failing on the missing key.
        if POS in features:
            pos = self.strings.as_int(features.pop(POS))
        else:
            pos = 0
        lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
        if lemma == 0:
            # Ugh, self.lemmatize has opposite arg order from self.lemmatizer :(
            lemma = self.lemmatize(pos, token.lex.orth, features)
            self._cache.set(tag_id, token.lex.orth, <void*>lemma)
        token.lemma = lemma
        token.pos = <univ_pos_t>pos
        token.tag = self.strings[tag_str]
        token.morph = self.add(features)
        if (self.tag_names[tag_id], token.lex.orth) in self.exc:
            self._assign_tag_from_exceptions(token, tag_id)

    cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
        """Override POS and lemma from the special case for this tag and form."""
        cdef dict attrs
        key = (self.tag_names[tag_id], token.lex.orth)
        attrs = self.exc[key]
        token.pos = attrs.get(POS, token.pos)
        token.lemma = attrs.get(LEMMA, token.lemma)

    def load_morph_exceptions(self, dict exc):
        """Register special cases from a `{tag: {form: attrs}}` mapping."""
        # Map (form, pos) to attributes
        for tag_str, entries in exc.items():
            for form_str, attrs in entries.items():
                self.add_special_case(tag_str, form_str, attrs)

    @classmethod
    def create_class_map(cls):
        """Return a fresh MorphologyClassMap over all known FEATURES."""
        return MorphologyClassMap(FEATURES)
2019-03-09 22:55:33 +03:00
2018-09-25 00:57:41 +03:00
cpdef univ_pos_t get_int_tag(pos_):
    """Placeholder: ignores `pos_` and always returns the zero POS value."""
    return <univ_pos_t>0
2018-09-25 21:53:24 +03:00
cpdef intify_features(features):
    """Return the set of get_string_id() values for the given features."""
    feature_ids = set()
    for feature in features:
        feature_ids.add(get_string_id(feature))
    return feature_ids
2018-09-25 00:57:41 +03:00
2019-03-07 16:03:07 +03:00
cdef hash_t hash_tag(MorphAnalysisC tag) nogil:
    # Hash the raw bytes of the (by-value) struct with seed 0.
    cdef hash_t digest = mrmr.hash64(&tag, sizeof(tag), 0)
    return digest
2019-03-07 19:14:57 +03:00
2019-03-07 16:03:07 +03:00
cdef MorphAnalysisC create_rich_tag(features) except *:
    """Build an analysis struct with each of the given feature IDs switched on.

    features (iterable): Feature IDs; each must be a key of FEATURE_NAMES.
    """
    cdef MorphAnalysisC analysis
    cdef attr_t feature
    # Start from an all-zero struct so that unset fields read as 0.
    memset(&analysis, 0, sizeof(analysis))
    for feature in features:
        set_feature(&analysis, FEATURE_FIELDS[FEATURE_NAMES[feature]], feature, 1)
    return analysis
2019-03-07 19:14:57 +03:00
2019-03-08 02:08:35 +03:00
cdef tag_to_json(const MorphAnalysisC* tag):
    """Return the names of the features set on `tag` as a list of strings."""
    names = []
    for feature_id in list_features(tag):
        names.append(FEATURE_NAMES[feature_id])
    return names
cdef MorphAnalysisC tag_from_json(json_tag):
    """Not implemented: the body only raises NotImplementedError."""
    raise NotImplementedError
cdef list list_features(const MorphAnalysisC* tag):
    """Return the non-zero feature IDs stored on `tag`.

    Covers every field that set_feature() counts in `tag.length`, including
    num_form, num_type, num_value, punct_side and tense, which were
    previously left out and therefore silently dropped from the result.

    `tag.pos` is deliberately excluded: set_feature() stores the ID of the
    bare POS name there rather than a feature ID, and does not count it in
    `tag.length`.
    """
    # The previously listed fields keep their relative order; the added
    # fields sit next to their siblings.
    values = [
        tag.abbr,
        tag.adp_type,
        tag.adv_type,
        tag.animacy,
        tag.aspect,
        tag.case,
        tag.conj_type,
        tag.connegative,
        tag.definite,
        tag.degree,
        tag.derivation,
        tag.echo,
        tag.foreign,
        tag.gender,
        tag.hyph,
        tag.inf_form,
        tag.mood,
        tag.negative,
        tag.number,
        tag.name_type,
        tag.noun_type,
        tag.num_form,
        tag.num_type,
        tag.num_value,
        tag.part_form,
        tag.part_type,
        tag.person,
        tag.polite,
        tag.polarity,
        tag.poss,
        tag.prefix,
        tag.prep_case,
        tag.pron_type,
        tag.punct_side,
        tag.punct_type,
        tag.reflex,
        tag.style,
        tag.style_variant,
        tag.tense,
        tag.typo,
        tag.verb_form,
        tag.voice,
        tag.verb_type,
    ]
    return [value for value in values if value != 0]
2018-09-25 00:57:41 +03:00
2019-03-07 20:32:36 +03:00
2019-03-08 02:08:35 +03:00
cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil:
    # Return the value stored on `tag` for the field with ID `field_id`
    # (a univ_field_t value).
    # NOTE(review): the fallback below raises from a nogil function that has
    # no exception return value declared; confirm how the project's Cython
    # version handles that.
    field = <univ_field_t>field_id
    if field == Field_POS:
        return tag.pos
    if field == Field_Abbr:
        return tag.abbr
    if field == Field_AdpType:
        return tag.adp_type
    if field == Field_AdvType:
        return tag.adv_type
    if field == Field_Animacy:
        return tag.animacy
    if field == Field_Aspect:
        return tag.aspect
    if field == Field_Case:
        return tag.case
    if field == Field_ConjType:
        return tag.conj_type
    if field == Field_Connegative:
        return tag.connegative
    if field == Field_Definite:
        return tag.definite
    if field == Field_Degree:
        return tag.degree
    if field == Field_Derivation:
        return tag.derivation
    if field == Field_Echo:
        return tag.echo
    if field == Field_Foreign:
        return tag.foreign
    if field == Field_Gender:
        return tag.gender
    if field == Field_Hyph:
        return tag.hyph
    if field == Field_InfForm:
        return tag.inf_form
    if field == Field_Mood:
        return tag.mood
    if field == Field_Negative:
        return tag.negative
    if field == Field_Number:
        return tag.number
    if field == Field_NameType:
        return tag.name_type
    if field == Field_NounType:
        return tag.noun_type
    if field == Field_NumForm:
        return tag.num_form
    if field == Field_NumType:
        return tag.num_type
    if field == Field_NumValue:
        return tag.num_value
    if field == Field_PartForm:
        return tag.part_form
    if field == Field_PartType:
        return tag.part_type
    if field == Field_Person:
        return tag.person
    if field == Field_Polite:
        return tag.polite
    if field == Field_Polarity:
        return tag.polarity
    if field == Field_Poss:
        return tag.poss
    if field == Field_Prefix:
        return tag.prefix
    if field == Field_PrepCase:
        return tag.prep_case
    if field == Field_PronType:
        return tag.pron_type
    if field == Field_PunctSide:
        return tag.punct_side
    if field == Field_PunctType:
        return tag.punct_type
    if field == Field_Reflex:
        return tag.reflex
    if field == Field_Style:
        return tag.style
    if field == Field_StyleVariant:
        return tag.style_variant
    if field == Field_Tense:
        return tag.tense
    if field == Field_Typo:
        return tag.typo
    if field == Field_VerbForm:
        return tag.verb_form
    if field == Field_Voice:
        return tag.voice
    if field == Field_VerbType:
        return tag.verb_type
    raise ValueError("Unknown field: (%d)" % field_id)
2019-03-07 20:32:36 +03:00
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil:
    # Return 1 if any feature field of `tag` holds `feature`, else 0.
    # `tag.pos` is not consulted. Note that a `feature` of 0 compares equal
    # to any field that is currently unset (0).
    if (tag.abbr == feature
            or tag.adp_type == feature
            or tag.adv_type == feature
            or tag.animacy == feature
            or tag.aspect == feature
            or tag.case == feature
            or tag.conj_type == feature
            or tag.connegative == feature
            or tag.definite == feature
            or tag.degree == feature
            or tag.derivation == feature
            or tag.echo == feature
            or tag.foreign == feature
            or tag.gender == feature
            or tag.hyph == feature
            or tag.inf_form == feature
            or tag.mood == feature
            or tag.negative == feature
            or tag.number == feature
            or tag.name_type == feature
            or tag.noun_type == feature
            or tag.num_form == feature
            or tag.num_type == feature
            or tag.num_value == feature
            or tag.part_form == feature
            or tag.part_type == feature
            or tag.person == feature
            or tag.polite == feature
            or tag.polarity == feature
            or tag.poss == feature
            or tag.prefix == feature
            or tag.prep_case == feature
            or tag.pron_type == feature
            or tag.punct_side == feature
            or tag.punct_type == feature
            or tag.reflex == feature
            or tag.style == feature
            or tag.style_variant == feature
            or tag.tense == feature
            or tag.typo == feature
            or tag.verb_form == feature
            or tag.voice == feature
            or tag.verb_type == feature):
        return 1
    return 0
2019-03-07 19:14:57 +03:00
cdef int set_feature(MorphAnalysisC* tag,
        univ_field_t field, attr_t feature, int value) except -1:
    """Set or clear one field of a morphological analysis.

    tag (MorphAnalysisC*): Analysis to modify in place.
    field (univ_field_t): Field to write.
    feature (attr_t): Feature ID to store. 0 makes the call a no-op.
    value (int): 1 stores `feature` in the field; anything else clears it.
    RAISES (ValueError): If `field` is not a known field.

    `tag.length` counts the non-POS fields that are set and is kept in sync
    with the change made here.
    """
    cdef attr_t value_
    if feature == 0:
        # Nothing to store and nothing is cleared, so the length must not be
        # touched either (previously it could be decremented here while the
        # field itself stayed set).
        return 0
    if value == 1:
        value_ = feature
    else:
        value_ = 0
    prev_value = get_field(tag, field)
    if prev_value != 0 and value_ == 0 and field != Field_POS:
        tag.length -= 1
    elif prev_value == 0 and value_ != 0 and field != Field_POS:
        tag.length += 1
    if field == Field_POS:
        if value_ == 0:
            # Clearing: there is no feature name to look up for ID 0.
            tag.pos = 0
        else:
            # POS holds the ID of the bare POS name ("POS_NOUN" -> "NOUN").
            tag.pos = get_string_id(FEATURE_NAMES[value_].split('_')[1])
    elif field == Field_Abbr:
        tag.abbr = value_
    elif field == Field_AdpType:
        tag.adp_type = value_
    elif field == Field_AdvType:
        tag.adv_type = value_
    elif field == Field_Animacy:
        tag.animacy = value_
    elif field == Field_Aspect:
        tag.aspect = value_
    elif field == Field_Case:
        tag.case = value_
    elif field == Field_ConjType:
        tag.conj_type = value_
    elif field == Field_Connegative:
        tag.connegative = value_
    elif field == Field_Definite:
        tag.definite = value_
    elif field == Field_Degree:
        tag.degree = value_
    elif field == Field_Derivation:
        tag.derivation = value_
    elif field == Field_Echo:
        tag.echo = value_
    elif field == Field_Foreign:
        tag.foreign = value_
    elif field == Field_Gender:
        tag.gender = value_
    elif field == Field_Hyph:
        tag.hyph = value_
    elif field == Field_InfForm:
        tag.inf_form = value_
    elif field == Field_Mood:
        tag.mood = value_
    elif field == Field_Negative:
        tag.negative = value_
    elif field == Field_Number:
        tag.number = value_
    elif field == Field_NameType:
        tag.name_type = value_
    elif field == Field_NounType:
        tag.noun_type = value_
    elif field == Field_NumForm:
        tag.num_form = value_
    elif field == Field_NumType:
        tag.num_type = value_
    elif field == Field_NumValue:
        tag.num_value = value_
    elif field == Field_PartForm:
        tag.part_form = value_
    elif field == Field_PartType:
        tag.part_type = value_
    elif field == Field_Person:
        tag.person = value_
    elif field == Field_Polite:
        tag.polite = value_
    elif field == Field_Polarity:
        tag.polarity = value_
    elif field == Field_Poss:
        tag.poss = value_
    elif field == Field_Prefix:
        tag.prefix = value_
    elif field == Field_PrepCase:
        tag.prep_case = value_
    elif field == Field_PronType:
        tag.pron_type = value_
    elif field == Field_PunctSide:
        tag.punct_side = value_
    elif field == Field_PunctType:
        tag.punct_type = value_
    elif field == Field_Reflex:
        tag.reflex = value_
    elif field == Field_Style:
        tag.style = value_
    elif field == Field_StyleVariant:
        tag.style_variant = value_
    elif field == Field_Tense:
        tag.tense = value_
    elif field == Field_Typo:
        tag.typo = value_
    elif field == Field_VerbForm:
        tag.verb_form = value_
    elif field == Field_Voice:
        tag.voice = value_
    elif field == Field_VerbType:
        tag.verb_type = value_
    else:
        raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature))
2016-12-18 17:48:00 +03:00
# Field name as it appears before the underscore in a feature name
# ("Case" in "Case_nom") -> field ID.
FIELDS = {
    "POS": Field_POS,
    "Abbr": Field_Abbr,
    "AdpType": Field_AdpType,
    "AdvType": Field_AdvType,
    "Animacy": Field_Animacy,
    "Aspect": Field_Aspect,
    "Case": Field_Case,
    "ConjType": Field_ConjType,
    "Connegative": Field_Connegative,
    "Definite": Field_Definite,
    "Degree": Field_Degree,
    "Derivation": Field_Derivation,
    "Echo": Field_Echo,
    "Foreign": Field_Foreign,
    "Gender": Field_Gender,
    "Hyph": Field_Hyph,
    "InfForm": Field_InfForm,
    "Mood": Field_Mood,
    "NameType": Field_NameType,
    "Negative": Field_Negative,
    "NounType": Field_NounType,
    "Number": Field_Number,
    "NumForm": Field_NumForm,
    "NumType": Field_NumType,
    "NumValue": Field_NumValue,
    "PartForm": Field_PartForm,
    "PartType": Field_PartType,
    "Person": Field_Person,
    "Polite": Field_Polite,
    "Polarity": Field_Polarity,
    "Poss": Field_Poss,
    "Prefix": Field_Prefix,
    "PrepCase": Field_PrepCase,
    "PronType": Field_PronType,
    "PunctSide": Field_PunctSide,
    "PunctType": Field_PunctType,
    "Reflex": Field_Reflex,
    "Style": Field_Style,
    "StyleVariant": Field_StyleVariant,
    "Tense": Field_Tense,
    "Typo": Field_Typo,
    "VerbForm": Field_VerbForm,
    "VerbType": Field_VerbType,
    "Voice": Field_Voice,
}
# snake_case field name (matching the struct member names used in this
# module, e.g. `tag.adp_type`) -> field ID.
LOWER_FIELDS = {
    "pos": Field_POS,
    "abbr": Field_Abbr,
    "adp_type": Field_AdpType,
    "adv_type": Field_AdvType,
    "animacy": Field_Animacy,
    "aspect": Field_Aspect,
    "case": Field_Case,
    "conj_type": Field_ConjType,
    "connegative": Field_Connegative,
    "definite": Field_Definite,
    "degree": Field_Degree,
    "derivation": Field_Derivation,
    "echo": Field_Echo,
    "foreign": Field_Foreign,
    "gender": Field_Gender,
    "hyph": Field_Hyph,
    "inf_form": Field_InfForm,
    "mood": Field_Mood,
    "name_type": Field_NameType,
    "negative": Field_Negative,
    "noun_type": Field_NounType,
    "number": Field_Number,
    "num_form": Field_NumForm,
    "num_type": Field_NumType,
    "num_value": Field_NumValue,
    "part_form": Field_PartForm,
    "part_type": Field_PartType,
    "person": Field_Person,
    "polarity": Field_Polarity,
    "polite": Field_Polite,
    "poss": Field_Poss,
    "prefix": Field_Prefix,
    "prep_case": Field_PrepCase,
    "pron_type": Field_PronType,
    "punct_side": Field_PunctSide,
    "punct_type": Field_PunctType,
    "reflex": Field_Reflex,
    "style": Field_Style,
    "style_variant": Field_StyleVariant,
    "tense": Field_Tense,
    "typo": Field_Typo,
    "verb_form": Field_VerbForm,
    "verb_type": Field_VerbType,
    "voice": Field_Voice,
}
2019-03-07 19:14:57 +03:00
# Every morphological feature name known to this module, written as
# "<Field>_<value>". The text before the first "_" must be a key of FIELDS
# (FEATURE_FIELDS relies on that); the value part may itself contain
# underscores (e.g. "Gender_dat_masc").
# The sequence is kept exactly as originally written; do not reorder or
# drop entries without checking every user of this list.
FEATURES = [
    "POS_ADJ",
    "POS_ADP",
    "POS_ADV",
    "POS_AUX",
    "POS_CONJ",
    "POS_CCONJ",
    "POS_DET",
    "POS_INTJ",
    "POS_NOUN",
    "POS_NUM",
    "POS_PART",
    "POS_PRON",
    "POS_PROPN",
    "POS_PUNCT",
    "POS_SCONJ",
    "POS_SYM",
    "POS_VERB",
    "POS_X",
    "POS_EOL",
    "POS_SPACE",
    "Abbr_yes",
    "AdpType_circ",
    "AdpType_comprep",
    "AdpType_prep",
    "AdpType_post",
    "AdpType_voc",
    "AdvType_adadj",
    "AdvType_cau",
    "AdvType_deg",
    "AdvType_ex",
    "AdvType_loc",
    "AdvType_man",
    "AdvType_mod",
    "AdvType_sta",
    "AdvType_tim",
    "Animacy_anim",
    "Animacy_hum",
    "Animacy_inan",
    "Animacy_nhum",
    "Aspect_hab",
    "Aspect_imp",
    "Aspect_iter",
    "Aspect_perf",
    "Aspect_prog",
    "Aspect_prosp",
    "Aspect_none",
    "Case_abe",
    "Case_abl",
    "Case_abs",
    "Case_acc",
    "Case_ade",
    "Case_all",
    "Case_cau",
    "Case_com",
    "Case_dat",
    "Case_del",
    "Case_dis",
    "Case_ela",
    "Case_ess",
    "Case_gen",
    "Case_ill",
    "Case_ine",
    "Case_ins",
    "Case_loc",
    "Case_lat",
    "Case_nom",
    "Case_par",
    "Case_sub",
    "Case_sup",
    "Case_tem",
    "Case_ter",
    "Case_tra",
    "Case_voc",
    "ConjType_comp",
    "ConjType_oper",
    "Connegative_yes",
    "Definite_cons",
    "Definite_def",
    "Definite_ind",
    "Definite_red",
    "Definite_two",
    "Degree_abs",
    "Degree_cmp",
    "Degree_comp",
    "Degree_none",
    "Degree_pos",
    "Degree_sup",
    "Degree_com",
    "Degree_dim",
    "Derivation_minen",
    "Derivation_sti",
    "Derivation_inen",
    "Derivation_lainen",
    "Derivation_ja",
    "Derivation_ton",
    "Derivation_vs",
    "Derivation_ttain",
    "Derivation_ttaa",
    "Echo_rdp",
    "Echo_ech",
    "Foreign_foreign",
    "Foreign_fscript",
    "Foreign_tscript",
    "Foreign_yes",
    "Gender_com",
    "Gender_fem",
    "Gender_masc",
    "Gender_neut",
    "Gender_dat_masc",
    "Gender_dat_fem",
    "Gender_erg_masc",
    "Gender_erg_fem",
    "Gender_psor_masc",
    "Gender_psor_fem",
    "Gender_psor_neut",
    "Hyph_yes",
    "InfForm_one",
    "InfForm_two",
    "InfForm_three",
    "Mood_cnd",
    "Mood_imp",
    "Mood_ind",
    "Mood_n",
    "Mood_pot",
    "Mood_sub",
    "Mood_opt",
    "NameType_geo",
    "NameType_prs",
    "NameType_giv",
    "NameType_sur",
    "NameType_nat",
    "NameType_com",
    "NameType_pro",
    "NameType_oth",
    "Negative_neg",
    "Negative_pos",
    "Negative_yes",
    "NounType_com",
    "NounType_prop",
    "NounType_class",
    "Number_com",
    "Number_dual",
    "Number_none",
    "Number_plur",
    "Number_sing",
    "Number_ptan",
    "Number_count",
    "Number_abs_sing",
    "Number_abs_plur",
    "Number_dat_sing",
    "Number_dat_plur",
    "Number_erg_sing",
    "Number_erg_plur",
    "Number_psee_sing",
    "Number_psee_plur",
    "Number_psor_sing",
    "Number_psor_plur",
    "NumForm_digit",
    "NumForm_roman",
    "NumForm_word",
    "NumForm_combi",
    "NumType_card",
    "NumType_dist",
    "NumType_frac",
    "NumType_gen",
    "NumType_mult",
    "NumType_none",
    "NumType_ord",
    "NumType_sets",
    "NumType_dual",
    "NumValue_one",
    "NumValue_two",
    "NumValue_three",
    "PartForm_pres",
    "PartForm_past",
    "PartForm_agt",
    "PartForm_neg",
    "PartType_mod",
    "PartType_emp",
    "PartType_res",
    "PartType_inf",
    "PartType_vbp",
    "Person_one",
    "Person_two",
    "Person_three",
    "Person_none",
    "Person_abs_one",
    "Person_abs_two",
    "Person_abs_three",
    "Person_dat_one",
    "Person_dat_two",
    "Person_dat_three",
    "Person_erg_one",
    "Person_erg_two",
    "Person_erg_three",
    "Person_psor_one",
    "Person_psor_two",
    "Person_psor_three",
    "Polarity_neg",
    "Polarity_pos",
    "Polite_inf",
    "Polite_pol",
    "Polite_abs_inf",
    "Polite_abs_pol",
    "Polite_erg_inf",
    "Polite_erg_pol",
    "Polite_dat_inf",
    "Polite_dat_pol",
    "Poss_yes",
    "Prefix_yes",
    "PrepCase_npr",
    "PrepCase_pre",
    "PronType_advPart",
    "PronType_art",
    "PronType_default",
    "PronType_dem",
    "PronType_ind",
    "PronType_int",
    "PronType_neg",
    "PronType_prs",
    "PronType_rcp",
    "PronType_rel",
    "PronType_tot",
    "PronType_clit",
    "PronType_exc",
    "PunctSide_ini",
    "PunctSide_fin",
    "PunctType_peri",
    "PunctType_qest",
    "PunctType_excl",
    "PunctType_quot",
    "PunctType_brck",
    "PunctType_comm",
    "PunctType_colo",
    "PunctType_semi",
    "PunctType_dash",
    "Reflex_yes",
    "Style_arch",
    "Style_rare",
    "Style_poet",
    "Style_norm",
    "Style_coll",
    "Style_vrnc",
    "Style_sing",
    "Style_expr",
    "Style_derg",
    "Style_vulg",
    "Style_yes",
    "StyleVariant_styleShort",
    "StyleVariant_styleBound",
    "Tense_fut",
    "Tense_imp",
    "Tense_past",
    "Tense_pres",
    "Typo_yes",
    "VerbForm_fin",
    "VerbForm_ger",
    "VerbForm_inf",
    "VerbForm_none",
    "VerbForm_part",
    "VerbForm_partFut",
    "VerbForm_partPast",
    "VerbForm_partPres",
    "VerbForm_sup",
    "VerbForm_trans",
    "VerbForm_conv",
    "VerbForm_gdv",
    "VerbType_aux",
    "VerbType_cop",
    "VerbType_mod",
    "VerbType_light",
    "Voice_act",
    "Voice_cau",
    "Voice_pass",
    "Voice_mid",
    "Voice_int",
]
2019-03-09 22:55:33 +03:00
# Reverse lookup keyed by get_string_id(<feature name>), giving back the
# feature name itself; used e.g. to print a readable name for a feature ID
# in error messages.
FEATURE_NAMES = dict((get_string_id(name), name) for name in FEATURES)
# Feature name -> the FIELDS entry for its field, where the field is the
# text before the first "_" of the feature name.
FEATURE_FIELDS = dict(
    (name, FIELDS[name.partition('_')[0]]) for name in FEATURES
)