mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-03 02:17:00 +03:00
Wire up morphological features
This commit is contained in:
parent
9b406181cd
commit
9c32388235
|
@ -143,8 +143,10 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
for name, value in stringy_attrs.items():
|
for name, value in stringy_attrs.items():
|
||||||
if isinstance(name, int):
|
if isinstance(name, int):
|
||||||
int_key = name
|
int_key = name
|
||||||
else:
|
elif name.upper() in IDS:
|
||||||
int_key = IDS[name.upper()]
|
int_key = IDS[name.upper()]
|
||||||
|
else:
|
||||||
|
continue
|
||||||
if strings_map is not None and isinstance(value, basestring):
|
if strings_map is not None and isinstance(value, basestring):
|
||||||
if hasattr(strings_map, 'add'):
|
if hasattr(strings_map, 'add'):
|
||||||
value = strings_map.add(value)
|
value = strings_map.add(value)
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
||||||
|
from ...morphology import Fused_begin, Fused_inside
|
||||||
|
|
||||||
|
|
||||||
_exc = {
|
_exc = {
|
||||||
|
@ -47,7 +48,11 @@ _exc = {
|
||||||
|
|
||||||
"über'm": [
|
"über'm": [
|
||||||
{ORTH: "über", LEMMA: "über"},
|
{ORTH: "über", LEMMA: "über"},
|
||||||
{ORTH: "'m", LEMMA: "der", NORM: "dem"}]
|
{ORTH: "'m", LEMMA: "der", NORM: "dem"}],
|
||||||
|
"zum": [
|
||||||
|
{ORTH: "zu", LEMMA: "zu", "morphology": [Fused_begin]},
|
||||||
|
{ORTH: "m", LEMMA: "der", "morphology": [Fused_inside]}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -31,6 +31,7 @@ cdef class Morphology:
|
||||||
cdef public object reverse_index
|
cdef public object reverse_index
|
||||||
cdef public object tag_names
|
cdef public object tag_names
|
||||||
cdef public object exc
|
cdef public object exc
|
||||||
|
cdef public object _morph2features
|
||||||
|
|
||||||
cdef RichTagC* rich_tags
|
cdef RichTagC* rich_tags
|
||||||
cdef PreshMapArray _cache
|
cdef PreshMapArray _cache
|
||||||
|
@ -43,6 +44,8 @@ cdef class Morphology:
|
||||||
|
|
||||||
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
|
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
|
||||||
|
|
||||||
|
cdef int set_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
|
||||||
|
|
||||||
|
|
||||||
cdef enum univ_morph_t:
|
cdef enum univ_morph_t:
|
||||||
NIL = 0
|
NIL = 0
|
||||||
|
@ -298,4 +301,7 @@ cdef enum univ_morph_t:
|
||||||
VerbType_mod # U
|
VerbType_mod # U
|
||||||
VerbType_light # U
|
VerbType_light # U
|
||||||
|
|
||||||
|
Fused_begin
|
||||||
|
Fused_inside
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@ from .attrs import LEMMA, intify_attrs
|
||||||
from .parts_of_speech cimport SPACE
|
from .parts_of_speech cimport SPACE
|
||||||
from .parts_of_speech import IDS as POS_IDS
|
from .parts_of_speech import IDS as POS_IDS
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
|
from .strings cimport hash_string
|
||||||
|
|
||||||
|
|
||||||
def _normalize_props(props):
|
def _normalize_props(props):
|
||||||
|
@ -29,6 +30,11 @@ def _normalize_props(props):
|
||||||
out[key] = value
|
out[key] = value
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
cdef uint64_t hash_features(features):
    '''Hash a collection of morphological feature IDs into a 64-bit key.

    The features are de-duplicated and sorted before being rendered to a
    string, so the resulting key depends only on *which* features are
    present, not on the container type (list, tuple, set, frozenset) or on
    its iteration order. Without this, the same feature set supplied as a
    list in a different order would be given a different morph key.

    features: An iterable of mutually comparable feature IDs.
    RETURNS (uint64_t): The value of hash_string() for the canonical string.
    '''
    # TODO improve this
    cdef unicode string = str(tuple(sorted(set(features))))
    return hash_string(string)
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
|
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
|
||||||
|
@ -36,7 +42,7 @@ cdef class Morphology:
|
||||||
self.strings = string_store
|
self.strings = string_store
|
||||||
# Add special space symbol. We prefix with underscore, to make sure it
|
# Add special space symbol. We prefix with underscore, to make sure it
|
||||||
# always sorts to the end.
|
# always sorts to the end.
|
||||||
space_attrs = tag_map.get('SP', {POS: SPACE})
|
space_attrs = tag_map.get('_SP', tag_map.get('SP', {POS: SPACE}))
|
||||||
if '_SP' not in tag_map:
|
if '_SP' not in tag_map:
|
||||||
self.strings.add('_SP')
|
self.strings.add('_SP')
|
||||||
tag_map = dict(tag_map)
|
tag_map = dict(tag_map)
|
||||||
|
@ -48,16 +54,19 @@ cdef class Morphology:
|
||||||
self.reverse_index = {}
|
self.reverse_index = {}
|
||||||
|
|
||||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
|
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
|
||||||
|
self._morph2features = {}
|
||||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||||
|
features = attrs.get('morphology', frozenset())
|
||||||
self.strings.add(tag_str)
|
self.strings.add(tag_str)
|
||||||
self.tag_map[tag_str] = dict(attrs)
|
self.tag_map[tag_str] = dict(attrs)
|
||||||
attrs = _normalize_props(attrs)
|
attrs = _normalize_props(attrs)
|
||||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||||
self.rich_tags[i].id = i
|
self.rich_tags[i].id = i
|
||||||
self.rich_tags[i].name = self.strings.add(tag_str)
|
self.rich_tags[i].name = self.strings.add(tag_str)
|
||||||
self.rich_tags[i].morph = 0
|
self.rich_tags[i].morph = hash_features(features)
|
||||||
self.rich_tags[i].pos = attrs[POS]
|
self.rich_tags[i].pos = attrs[POS]
|
||||||
self.reverse_index[self.rich_tags[i].name] = i
|
self.reverse_index[self.rich_tags[i].name] = i
|
||||||
|
self._morph2features[self.rich_tags[i].morph] = features
|
||||||
# Add a 'null' tag, which we can reference when assign morphology to
|
# Add a 'null' tag, which we can reference when assign morphology to
|
||||||
# untagged tokens.
|
# untagged tokens.
|
||||||
self.rich_tags[self.n_tags].id = self.n_tags
|
self.rich_tags[self.n_tags].id = self.n_tags
|
||||||
|
@ -114,12 +123,30 @@ cdef class Morphology:
|
||||||
token.tag = analysis.tag.name
|
token.tag = analysis.tag.name
|
||||||
token.morph = analysis.tag.morph
|
token.morph = analysis.tag.morph
|
||||||
|
|
||||||
cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1:
|
cdef int assign_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
|
||||||
cdef flags_t one = 1
|
# Deprecated
|
||||||
|
pass
|
||||||
|
|
||||||
|
cdef int set_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
|
||||||
|
'''Update a morph attribute in-place, so that it indicates the given
|
||||||
|
feature.
|
||||||
|
'''
|
||||||
|
features = self._morph2features.get(morph[0], {})
|
||||||
|
cdef uint64_t key
|
||||||
|
cdef attr_t flag = flag_id
|
||||||
|
if (flag in features) != value:
|
||||||
|
new_features = set(features)
|
||||||
if value:
|
if value:
|
||||||
flags[0] |= one << flag_id
|
new_features.add(flag)
|
||||||
else:
|
else:
|
||||||
flags[0] &= ~(one << flag_id)
|
new_features.remove(flag)
|
||||||
|
new_features = frozenset(new_features)
|
||||||
|
key = hash_features(new_features)
|
||||||
|
morph[0] = key
|
||||||
|
self._morph2features[morph[0]] = new_features
|
||||||
|
|
||||||
|
def get_features(self, uint64_t morph):
    '''Return the features stored under a morph key.

    morph (uint64_t): The key to look up in the morph-to-features table.
    RETURNS: The stored feature collection, or an empty frozenset when the
        key has not been registered.
    '''
    try:
        return self._morph2features[morph]
    except KeyError:
        return frozenset()
|
||||||
|
|
||||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
|
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
|
||||||
force=False):
|
force=False):
|
||||||
|
@ -140,6 +167,9 @@ cdef class Morphology:
|
||||||
tag_id = self.reverse_index[tag]
|
tag_id = self.reverse_index[tag]
|
||||||
orth = self.strings[orth_str]
|
orth = self.strings[orth_str]
|
||||||
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
||||||
|
features = attrs.get('morphology', frozenset())
|
||||||
|
cdef uint64_t morph = hash_features(features)
|
||||||
|
self._morph2features[morph] = features
|
||||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||||
cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
|
cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
|
||||||
if cached is NULL:
|
if cached is NULL:
|
||||||
|
@ -152,12 +182,11 @@ cdef class Morphology:
|
||||||
"force=True to overwrite." % (tag_str, orth_str))
|
"force=True to overwrite." % (tag_str, orth_str))
|
||||||
|
|
||||||
cached.tag = rich_tag
|
cached.tag = rich_tag
|
||||||
|
cached.tag.morph = morph
|
||||||
# TODO: Refactor this to take arbitrary attributes.
|
# TODO: Refactor this to take arbitrary attributes.
|
||||||
for name_id, value_id in attrs.items():
|
for name_id, value_id in attrs.items():
|
||||||
if name_id == LEMMA:
|
if name_id == LEMMA:
|
||||||
cached.lemma = value_id
|
cached.lemma = value_id
|
||||||
else:
|
|
||||||
self.assign_feature(&cached.tag.morph, name_id, value_id)
|
|
||||||
if cached.lemma == 0:
|
if cached.lemma == 0:
|
||||||
cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
|
cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
|
||||||
self._cache.set(tag_id, orth, <void*>cached)
|
self._cache.set(tag_id, orth, <void*>cached)
|
||||||
|
@ -434,6 +463,9 @@ IDS = {
|
||||||
"VerbType_cop": VerbType_cop, # U,
|
"VerbType_cop": VerbType_cop, # U,
|
||||||
"VerbType_mod": VerbType_mod, # U,
|
"VerbType_mod": VerbType_mod, # U,
|
||||||
"VerbType_light": VerbType_light, # U,
|
"VerbType_light": VerbType_light, # U,
|
||||||
|
|
||||||
|
"Fused_begin": Fused_begin, # Internal
|
||||||
|
"Fused_inside": Fused_inside # Internal
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -385,6 +385,9 @@ cdef enum symbol_t:
|
||||||
VerbType_mod # U
|
VerbType_mod # U
|
||||||
VerbType_light # U
|
VerbType_light # U
|
||||||
|
|
||||||
|
Fused_begin
|
||||||
|
Fused_inside
|
||||||
|
|
||||||
PERSON
|
PERSON
|
||||||
NORP
|
NORP
|
||||||
FACILITY
|
FACILITY
|
||||||
|
|
|
@ -390,6 +390,9 @@ IDS = {
|
||||||
"VerbType_mod": VerbType_mod, # U,
|
"VerbType_mod": VerbType_mod, # U,
|
||||||
"VerbType_light": VerbType_light, # U,
|
"VerbType_light": VerbType_light, # U,
|
||||||
|
|
||||||
|
"Fused_begin": Fused_begin,
|
||||||
|
"Fused_inside": Fused_inside,
|
||||||
|
|
||||||
"PERSON": PERSON,
|
"PERSON": PERSON,
|
||||||
"NORP": NORP,
|
"NORP": NORP,
|
||||||
"FACILITY": FACILITY,
|
"FACILITY": FACILITY,
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from ....morphology import Fused_begin, Fused_inside
|
||||||
|
|
||||||
|
|
||||||
def test_tokenizer_handles_long_text(de_tokenizer):
|
def test_tokenizer_handles_long_text(de_tokenizer):
|
||||||
|
@ -22,9 +23,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
|
||||||
»Was ist mit mir geschehen?«, dachte er."""
|
»Was ist mit mir geschehen?«, dachte er."""
|
||||||
|
|
||||||
tokens = de_tokenizer(text)
|
tokens = de_tokenizer(text)
|
||||||
assert len(tokens) == 109
|
assert len(tokens) == 110
|
||||||
|
|
||||||
|
|
||||||
|
def test_fused(de_tokenizer):
    """Check that 'zum' splits into two tokens carrying the fused markers."""
    tokens = de_tokenizer('zum')
    assert len(tokens) == 2
    # The first piece must be flagged as the start of a fused token, the
    # second as its continuation.
    expected_markers = (Fused_begin, Fused_inside)
    for token, marker in zip(tokens, expected_markers):
        assert token.check_morph(marker)
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', [
|
@pytest.mark.parametrize('text', [
|
||||||
"Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
|
"Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
|
||||||
"Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
|
"Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
|
||||||
|
|
|
@ -10,6 +10,7 @@ cimport numpy as np
|
||||||
np.import_array()
|
np.import_array()
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
|
from ..morphology cimport univ_morph_t
|
||||||
from ..typedefs cimport hash_t
|
from ..typedefs cimport hash_t
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from .. import parts_of_speech
|
from .. import parts_of_speech
|
||||||
|
@ -128,6 +129,15 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return Lexeme.c_check_flag(self.c.lex, flag_id)
|
return Lexeme.c_check_flag(self.c.lex, flag_id)
|
||||||
|
|
||||||
|
def set_morph(self, univ_morph_t feature, bint value):
    '''Set or clear a morphological feature on this token.

    Delegates to the vocab's Morphology.set_feature, passing a pointer to
    the token's morph field so that it can be updated in place.

    feature (univ_morph_t): The feature to change.
    value (bint): Forwarded to set_feature as the desired state of the
        feature.
    '''
    self.vocab.morphology.set_feature(&self.c.morph, feature, value)
|
||||||
|
|
||||||
|
def check_morph(self, univ_morph_t feature):
    '''Report whether this token carries a morphological feature.

    feature (univ_morph_t): The feature to test for.
    RETURNS (bool): True if the feature is among those the vocab's
        morphology returns for this token's morph value.
    '''
    return feature in self.vocab.morphology.get_features(self.c.morph)
|
||||||
|
|
||||||
def nbor(self, int i=1):
|
def nbor(self, int i=1):
|
||||||
"""Get a neighboring token.
|
"""Get a neighboring token.
|
||||||
|
|
||||||
|
|
|
@ -232,14 +232,17 @@ cdef class Vocab:
|
||||||
cdef int i
|
cdef int i
|
||||||
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
||||||
for i, props in enumerate(substrings):
|
for i, props in enumerate(substrings):
|
||||||
|
features = props.get('morphology', frozenset())
|
||||||
props = intify_attrs(props, strings_map=self.strings,
|
props = intify_attrs(props, strings_map=self.strings,
|
||||||
_do_deprecated=True)
|
_do_deprecated=False)
|
||||||
token = &tokens[i]
|
token = &tokens[i]
|
||||||
# Set the special tokens up to have arbitrary attributes
|
# Set the special tokens up to have arbitrary attributes
|
||||||
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
|
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
|
||||||
token.lex = lex
|
token.lex = lex
|
||||||
if TAG in props:
|
if TAG in props:
|
||||||
self.morphology.assign_tag(token, props[TAG])
|
self.morphology.assign_tag(token, props[TAG])
|
||||||
|
for feature in features:
|
||||||
|
self.morphology.set_feature(&token.morph, feature, True)
|
||||||
for attr_id, value in props.items():
|
for attr_id, value in props.items():
|
||||||
Token.set_struct_attr(token, attr_id, value)
|
Token.set_struct_attr(token, attr_id, value)
|
||||||
Lexeme.set_struct_attr(lex, attr_id, value)
|
Lexeme.set_struct_attr(lex, attr_id, value)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user