mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00
Fix exception loading
This commit is contained in:
parent
e4d8f86d7f
commit
8308c1525e
|
@ -60,13 +60,13 @@ class Lemmatizer(object):
|
||||||
return True
|
return True
|
||||||
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
||||||
return True
|
return True
|
||||||
elif VerbForm_inf in morphology:
|
elif VerbForm_inf in morphology or 'VerbForm_inf' in morphology:
|
||||||
return True
|
return True
|
||||||
elif VerbForm_none in morphology:
|
elif VerbForm_none in morphology or 'VerbForm_none' in morphology:
|
||||||
return True
|
return True
|
||||||
elif Number_sing in morphology:
|
elif Number_sing in morphology or 'Number_sing' in morphology:
|
||||||
return True
|
return True
|
||||||
elif Degree_pos in morphology:
|
elif Degree_pos in morphology or 'Degree_pos' in morphology:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -30,6 +30,8 @@ cdef class Morphology:
|
||||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
|
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
|
||||||
cdef update_morph(self, hash_t morph, features)
|
cdef update_morph(self, hash_t morph, features)
|
||||||
|
|
||||||
|
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1
|
||||||
|
|
||||||
cdef enum univ_morph_t:
|
cdef enum univ_morph_t:
|
||||||
NIL = 0
|
NIL = 0
|
||||||
Animacy_anim = symbols.Animacy_anim
|
Animacy_anim = symbols.Animacy_anim
|
||||||
|
|
|
@ -5,6 +5,7 @@ from __future__ import unicode_literals
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
import ujson as json
|
import ujson as json
|
||||||
|
|
||||||
|
from . import symbols
|
||||||
from .attrs cimport POS, IS_SPACE
|
from .attrs cimport POS, IS_SPACE
|
||||||
from .attrs import LEMMA, intify_attrs
|
from .attrs import LEMMA, intify_attrs
|
||||||
from .parts_of_speech cimport SPACE
|
from .parts_of_speech cimport SPACE
|
||||||
|
@ -17,6 +18,24 @@ from .errors import Errors
|
||||||
def _normalize_props(props):
|
def _normalize_props(props):
|
||||||
"""Transform deprecated string keys to correct names."""
|
"""Transform deprecated string keys to correct names."""
|
||||||
out = {}
|
out = {}
|
||||||
|
morph_keys = [
|
||||||
|
'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
|
||||||
|
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
|
||||||
|
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
|
||||||
|
'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
|
||||||
|
'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
|
||||||
|
'NumValue', 'PartType', 'Polite', 'StyleVariant',
|
||||||
|
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
|
||||||
|
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
|
||||||
|
'Polarity', 'PrepCase', 'Animacy' # U20
|
||||||
|
]
|
||||||
|
props = dict(props)
|
||||||
|
for key in morph_keys:
|
||||||
|
if key in props:
|
||||||
|
attr = '%s_%s' % (key, props[key])
|
||||||
|
if attr in IDS:
|
||||||
|
props.pop(key)
|
||||||
|
props[attr] = True
|
||||||
for key, value in props.items():
|
for key, value in props.items():
|
||||||
if key == POS:
|
if key == POS:
|
||||||
if hasattr(value, 'upper'):
|
if hasattr(value, 'upper'):
|
||||||
|
@ -58,15 +77,16 @@ cdef class Morphology:
|
||||||
self.n_tags = len(tag_map)
|
self.n_tags = len(tag_map)
|
||||||
self.reverse_index = {}
|
self.reverse_index = {}
|
||||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||||
print(tag_str, attrs)
|
attrs = _normalize_props(attrs)
|
||||||
self.tag_map[tag_str] = dict(attrs)
|
self.tag_map[tag_str] = dict(attrs)
|
||||||
self.reverse_index[self.strings.add(tag_str)] = i
|
self.reverse_index[self.strings.add(tag_str)] = i
|
||||||
|
|
||||||
self._cache = PreshMapArray(self.n_tags)
|
self._cache = PreshMapArray(self.n_tags)
|
||||||
self.exc = {}
|
self.exc = {}
|
||||||
if exc is not None:
|
if exc is not None:
|
||||||
for (tag_str, orth_str), attrs in exc.items():
|
for (tag, orth), attrs in exc.items():
|
||||||
self.add_special_case(tag_str, orth_str, attrs)
|
self.add_special_case(
|
||||||
|
self.strings.as_string(tag), self.strings.as_string(orth), attrs)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
||||||
|
@ -102,37 +122,10 @@ cdef class Morphology:
|
||||||
tag (unicode): The part-of-speech tag to key the exception.
|
tag (unicode): The part-of-speech tag to key the exception.
|
||||||
orth (unicode): The word-form to key the exception.
|
orth (unicode): The word-form to key the exception.
|
||||||
"""
|
"""
|
||||||
pass
|
attrs = dict(attrs)
|
||||||
## TODO: Currently we've assumed that we know the number of tags --
|
attrs = _normalize_props(attrs)
|
||||||
## RichTagC is an array, and _cache is a PreshMapArray
|
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||||
## This is really bad: it makes the morphology typed to the tagger
|
self.exc[(tag_str, self.strings.add(orth_str))] = attrs
|
||||||
## classes, which is all wrong.
|
|
||||||
#self.exc[(tag_str, orth_str)] = dict(attrs)
|
|
||||||
#tag = self.strings.add(tag_str)
|
|
||||||
#if tag not in self.reverse_index:
|
|
||||||
# return
|
|
||||||
#tag_id = self.reverse_index[tag]
|
|
||||||
#orth = self.strings[orth_str]
|
|
||||||
#cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
|
||||||
#attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
|
||||||
#cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
|
|
||||||
#if cached is NULL:
|
|
||||||
# cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
|
||||||
#elif force:
|
|
||||||
# memset(cached, 0, sizeof(cached[0]))
|
|
||||||
#else:
|
|
||||||
# raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str))
|
|
||||||
|
|
||||||
#cached.tag = rich_tag
|
|
||||||
## TODO: Refactor this to take arbitrary attributes.
|
|
||||||
#for name_id, value_id in attrs.items():
|
|
||||||
# if name_id == LEMMA:
|
|
||||||
# cached.lemma = value_id
|
|
||||||
# else:
|
|
||||||
# self.assign_feature(&cached.tag.morph, name_id, value_id)
|
|
||||||
#if cached.lemma == 0:
|
|
||||||
# cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
|
|
||||||
#self._cache.set(tag_id, orth, <void*>cached)
|
|
||||||
|
|
||||||
cdef hash_t insert(self, RichTagC tag) except 0:
|
cdef hash_t insert(self, RichTagC tag) except 0:
|
||||||
cdef hash_t key = hash_tag(tag)
|
cdef hash_t key = hash_tag(tag)
|
||||||
|
@ -171,17 +164,27 @@ cdef class Morphology:
|
||||||
tag_id = self.reverse_index[self.strings.add('_SP')]
|
tag_id = self.reverse_index[self.strings.add('_SP')]
|
||||||
tag_str = self.tag_names[tag_id]
|
tag_str = self.tag_names[tag_id]
|
||||||
features = dict(self.tag_map.get(tag_str, {}))
|
features = dict(self.tag_map.get(tag_str, {}))
|
||||||
cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
|
if features:
|
||||||
if lemma == 0 and features:
|
|
||||||
pos = self.strings.as_int(features.pop(POS))
|
pos = self.strings.as_int(features.pop(POS))
|
||||||
lemma = self.lemmatize(pos, token.lex.orth, features)
|
|
||||||
self._cache.set(tag_id, token.lex.orth, <void*>lemma)
|
|
||||||
else:
|
else:
|
||||||
pos = 0
|
pos = 0
|
||||||
|
cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
|
||||||
|
if lemma == 0:
|
||||||
|
lemma = self.lemmatize(pos, token.lex.orth, features)
|
||||||
|
self._cache.set(tag_id, token.lex.orth, <void*>lemma)
|
||||||
token.lemma = lemma
|
token.lemma = lemma
|
||||||
token.pos = <univ_pos_t>pos
|
token.pos = <univ_pos_t>pos
|
||||||
token.tag = self.strings[tag_str]
|
token.tag = self.strings[tag_str]
|
||||||
token.morph = self.add(features)
|
token.morph = self.add(features)
|
||||||
|
if (self.tag_names[tag_id], token.lex.orth) in self.exc:
|
||||||
|
self._assign_tag_from_exceptions(token, tag_id)
|
||||||
|
|
||||||
|
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
|
||||||
|
key = (self.tag_names[tag_id], token.lex.orth)
|
||||||
|
cdef dict attrs
|
||||||
|
attrs = self.exc[key]
|
||||||
|
token.pos = attrs.get(POS, token.pos)
|
||||||
|
token.lemma = attrs.get(LEMMA, token.lemma)
|
||||||
|
|
||||||
cdef update_morph(self, hash_t morph, features):
|
cdef update_morph(self, hash_t morph, features):
|
||||||
"""Update a morphological analysis with new feature values."""
|
"""Update a morphological analysis with new feature values."""
|
||||||
|
@ -194,6 +197,12 @@ cdef class Morphology:
|
||||||
morph = self.insert_tag(tag)
|
morph = self.insert_tag(tag)
|
||||||
return morph
|
return morph
|
||||||
|
|
||||||
|
def load_morph_exceptions(self, dict exc):
|
||||||
|
# Map (form, pos) to (lemma, rich tag)
|
||||||
|
for tag_str, entries in exc.items():
|
||||||
|
for form_str, attrs in entries.items():
|
||||||
|
self.add_special_case(tag_str, form_str, attrs)
|
||||||
|
|
||||||
def to_bytes(self):
|
def to_bytes(self):
|
||||||
json_tags = []
|
json_tags = []
|
||||||
for key in self.tags:
|
for key in self.tags:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user