Wire up morphological features

This commit is contained in:
Matthew Honnibal 2018-02-25 21:22:45 +01:00
parent 9b406181cd
commit 9c32388235
9 changed files with 201 additions and 130 deletions

View File

@ -143,8 +143,10 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items(): for name, value in stringy_attrs.items():
if isinstance(name, int): if isinstance(name, int):
int_key = name int_key = name
else: elif name.upper() in IDS:
int_key = IDS[name.upper()] int_key = IDS[name.upper()]
else:
continue
if strings_map is not None and isinstance(value, basestring): if strings_map is not None and isinstance(value, basestring):
if hasattr(strings_map, 'add'): if hasattr(strings_map, 'add'):
value = strings_map.add(value) value = strings_map.add(value)

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
from ...morphology import Fused_begin, Fused_inside
_exc = { _exc = {
@ -47,7 +48,11 @@ _exc = {
"über'm": [ "über'm": [
{ORTH: "über", LEMMA: "über"}, {ORTH: "über", LEMMA: "über"},
{ORTH: "'m", LEMMA: "der", NORM: "dem"}] {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
"zum": [
{ORTH: "zu", LEMMA: "zu", "morphology": [Fused_begin]},
{ORTH: "m", LEMMA: "der", "morphology": [Fused_inside]}
]
} }

View File

@ -31,6 +31,7 @@ cdef class Morphology:
cdef public object reverse_index cdef public object reverse_index
cdef public object tag_names cdef public object tag_names
cdef public object exc cdef public object exc
cdef public object _morph2features
cdef RichTagC* rich_tags cdef RichTagC* rich_tags
cdef PreshMapArray _cache cdef PreshMapArray _cache
@ -43,6 +44,8 @@ cdef class Morphology:
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
cdef int set_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
cdef enum univ_morph_t: cdef enum univ_morph_t:
NIL = 0 NIL = 0
@ -298,4 +301,7 @@ cdef enum univ_morph_t:
VerbType_mod # U VerbType_mod # U
VerbType_light # U VerbType_light # U
Fused_begin
Fused_inside

View File

@ -9,6 +9,7 @@ from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .strings cimport hash_string
def _normalize_props(props): def _normalize_props(props):
@ -29,6 +30,11 @@ def _normalize_props(props):
out[key] = value out[key] = value
return out return out
cdef uint64_t hash_features(features):
    # Build a 64-bit key for a collection of morphological feature IDs.
    #
    # The key is the hash of the string form of the features as a tuple.
    # The features are sorted first so the key does not depend on iteration
    # order: callers pass both lists and (frozen)sets, and two collections
    # holding the same features must map to the same key. Collections with
    # zero or one element hash exactly as before.
    # NOTE(review): assumes the feature IDs are mutually orderable (they are
    # expected to be integer enum values) -- confirm against callers.
    # TODO improve this
    cdef unicode string = str(tuple(sorted(features)))
    return hash_string(string)
cdef class Morphology: cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
@ -36,7 +42,7 @@ cdef class Morphology:
self.strings = string_store self.strings = string_store
# Add special space symbol. We prefix with underscore, to make sure it # Add special space symbol. We prefix with underscore, to make sure it
# always sorts to the end. # always sorts to the end.
space_attrs = tag_map.get('SP', {POS: SPACE}) space_attrs = tag_map.get('_SP', tag_map.get('SP', {POS: SPACE}))
if '_SP' not in tag_map: if '_SP' not in tag_map:
self.strings.add('_SP') self.strings.add('_SP')
tag_map = dict(tag_map) tag_map = dict(tag_map)
@ -48,16 +54,19 @@ cdef class Morphology:
self.reverse_index = {} self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
self._morph2features = {}
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
features = attrs.get('morphology', frozenset())
self.strings.add(tag_str) self.strings.add(tag_str)
self.tag_map[tag_str] = dict(attrs) self.tag_map[tag_str] = dict(attrs)
attrs = _normalize_props(attrs) attrs = _normalize_props(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.rich_tags[i].id = i self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings.add(tag_str) self.rich_tags[i].name = self.strings.add(tag_str)
self.rich_tags[i].morph = 0 self.rich_tags[i].morph = hash_features(features)
self.rich_tags[i].pos = attrs[POS] self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i self.reverse_index[self.rich_tags[i].name] = i
self._morph2features[self.rich_tags[i].morph] = features
# Add a 'null' tag, which we can reference when assign morphology to # Add a 'null' tag, which we can reference when assign morphology to
# untagged tokens. # untagged tokens.
self.rich_tags[self.n_tags].id = self.n_tags self.rich_tags[self.n_tags].id = self.n_tags
@ -114,12 +123,30 @@ cdef class Morphology:
token.tag = analysis.tag.name token.tag = analysis.tag.name
token.morph = analysis.tag.morph token.morph = analysis.tag.morph
cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1: cdef int assign_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
cdef flags_t one = 1 # Deprecated
pass
cdef int set_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
'''Update a morph attribute in-place, so that it indicates the given
feature.
'''
features = self._morph2features.get(morph[0], {})
cdef uint64_t key
cdef attr_t flag = flag_id
if (flag in features) != value:
new_features = set(features)
if value: if value:
flags[0] |= one << flag_id new_features.add(flag)
else: else:
flags[0] &= ~(one << flag_id) new_features.remove(flag)
new_features = frozenset(new_features)
key = hash_features(new_features)
morph[0] = key
self._morph2features[morph[0]] = new_features
def get_features(self, uint64_t morph):
    '''Look up the features registered under a morph key.

    morph (uint64_t): Key to look up in the morph-to-features table.
    RETURNS: The stored feature collection, or an empty frozenset when the
        key has not been registered.
    '''
    table = self._morph2features
    if morph in table:
        return table[morph]
    return frozenset()
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
force=False): force=False):
@ -140,6 +167,9 @@ cdef class Morphology:
tag_id = self.reverse_index[tag] tag_id = self.reverse_index[tag]
orth = self.strings[orth_str] orth = self.strings[orth_str]
cdef RichTagC rich_tag = self.rich_tags[tag_id] cdef RichTagC rich_tag = self.rich_tags[tag_id]
features = attrs.get('morphology', frozenset())
cdef uint64_t morph = hash_features(features)
self._morph2features[morph] = features
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
cached = <MorphAnalysisC*>self._cache.get(tag_id, orth) cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
if cached is NULL: if cached is NULL:
@ -152,12 +182,11 @@ cdef class Morphology:
"force=True to overwrite." % (tag_str, orth_str)) "force=True to overwrite." % (tag_str, orth_str))
cached.tag = rich_tag cached.tag = rich_tag
cached.tag.morph = morph
# TODO: Refactor this to take arbitrary attributes. # TODO: Refactor this to take arbitrary attributes.
for name_id, value_id in attrs.items(): for name_id, value_id in attrs.items():
if name_id == LEMMA: if name_id == LEMMA:
cached.lemma = value_id cached.lemma = value_id
else:
self.assign_feature(&cached.tag.morph, name_id, value_id)
if cached.lemma == 0: if cached.lemma == 0:
cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs) cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
self._cache.set(tag_id, orth, <void*>cached) self._cache.set(tag_id, orth, <void*>cached)
@ -434,6 +463,9 @@ IDS = {
"VerbType_cop": VerbType_cop, # U, "VerbType_cop": VerbType_cop, # U,
"VerbType_mod": VerbType_mod, # U, "VerbType_mod": VerbType_mod, # U,
"VerbType_light": VerbType_light, # U, "VerbType_light": VerbType_light, # U,
"Fused_begin": Fused_begin, # Internal
"Fused_inside": Fused_inside # Internal
} }

View File

@ -385,6 +385,9 @@ cdef enum symbol_t:
VerbType_mod # U VerbType_mod # U
VerbType_light # U VerbType_light # U
Fused_begin
Fused_inside
PERSON PERSON
NORP NORP
FACILITY FACILITY

View File

@ -390,6 +390,9 @@ IDS = {
"VerbType_mod": VerbType_mod, # U, "VerbType_mod": VerbType_mod, # U,
"VerbType_light": VerbType_light, # U, "VerbType_light": VerbType_light, # U,
"Fused_begin": Fused_begin,
"Fused_inside": Fused_inside,
"PERSON": PERSON, "PERSON": PERSON,
"NORP": NORP, "NORP": NORP,
"FACILITY": FACILITY, "FACILITY": FACILITY,

View File

@ -5,6 +5,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
from ....morphology import Fused_begin, Fused_inside
def test_tokenizer_handles_long_text(de_tokenizer): def test_tokenizer_handles_long_text(de_tokenizer):
@ -22,9 +23,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
»Was ist mit mir geschehen?«, dachte er.""" »Was ist mit mir geschehen?«, dachte er."""
tokens = de_tokenizer(text) tokens = de_tokenizer(text)
assert len(tokens) == 109 assert len(tokens) == 110
def test_fused(de_tokenizer):
    """'zum' is split into two tokens marked as the start and the inside of
    a fused span."""
    tokens = de_tokenizer('zum')
    assert len(tokens) == 2
    first = tokens[0]
    second = tokens[1]
    assert first.check_morph(Fused_begin)
    assert second.check_morph(Fused_inside)
@pytest.mark.parametrize('text', [ @pytest.mark.parametrize('text', [
"Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
"Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",

View File

@ -10,6 +10,7 @@ cimport numpy as np
np.import_array() np.import_array()
import numpy import numpy
from ..morphology cimport univ_morph_t
from ..typedefs cimport hash_t from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from .. import parts_of_speech from .. import parts_of_speech
@ -128,6 +129,15 @@ cdef class Token:
""" """
return Lexeme.c_check_flag(self.c.lex, flag_id) return Lexeme.c_check_flag(self.c.lex, flag_id)
def set_morph(self, univ_morph_t feature, bint value):
    '''Set or clear a morphological feature on this token.

    Delegates to the vocab's morphology object, passing a pointer to this
    token's morph value so that it can be updated in place.

    feature (univ_morph_t): ID of the feature to change.
    value (bint): True to add the feature, False to remove it.
    '''
    self.vocab.morphology.set_feature(&self.c.morph, feature, value)
def check_morph(self, univ_morph_t feature):
    '''Report whether this token carries a morphological feature.

    feature (univ_morph_t): ID of the feature to test for.
    RETURNS (bool): True if the feature is among those the vocab's
        morphology object returns for this token's morph value.
    '''
    return feature in self.vocab.morphology.get_features(self.c.morph)
def nbor(self, int i=1): def nbor(self, int i=1):
"""Get a neighboring token. """Get a neighboring token.

View File

@ -232,14 +232,17 @@ cdef class Vocab:
cdef int i cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings): for i, props in enumerate(substrings):
features = props.get('morphology', frozenset())
props = intify_attrs(props, strings_map=self.strings, props = intify_attrs(props, strings_map=self.strings,
_do_deprecated=True) _do_deprecated=False)
token = &tokens[i] token = &tokens[i]
# Set the special tokens up to have arbitrary attributes # Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH]) lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
token.lex = lex token.lex = lex
if TAG in props: if TAG in props:
self.morphology.assign_tag(token, props[TAG]) self.morphology.assign_tag(token, props[TAG])
for feature in features:
self.morphology.set_feature(&token.morph, feature, True)
for attr_id, value in props.items(): for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value) Token.set_struct_attr(token, attr_id, value)
Lexeme.set_struct_attr(lex, attr_id, value) Lexeme.set_struct_attr(lex, attr_id, value)