mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Refactor morphologizer class map
This commit is contained in:
parent
f742900f83
commit
41a3016019
|
@ -20,6 +20,7 @@ cdef class Morphology:
|
||||||
cdef readonly object tag_names
|
cdef readonly object tag_names
|
||||||
cdef readonly object reverse_index
|
cdef readonly object reverse_index
|
||||||
cdef readonly object exc
|
cdef readonly object exc
|
||||||
|
cdef readonly object _feat_map
|
||||||
cdef readonly PreshMapArray _cache
|
cdef readonly PreshMapArray _cache
|
||||||
cdef readonly int n_tags
|
cdef readonly int n_tags
|
||||||
|
|
||||||
|
@ -36,6 +37,5 @@ cdef class Morphology:
|
||||||
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil
|
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil
|
||||||
cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil
|
cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil
|
||||||
cdef list list_features(const MorphAnalysisC* tag)
|
cdef list list_features(const MorphAnalysisC* tag)
|
||||||
cdef int attribute_to_field(unicode attribute)
|
|
||||||
|
|
||||||
cdef tag_to_json(const MorphAnalysisC* tag)
|
cdef tag_to_json(const MorphAnalysisC* tag)
|
||||||
|
|
|
@ -93,26 +93,34 @@ def _normalize_props(props):
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def parse_feature(feature):
|
class MorphologyClassMap(object):
|
||||||
field = FEATURE_FIELDS[feature]
|
def __init__(self, features, fields):
|
||||||
offset = FEATURE_OFFSETS[feature]
|
self.features = tuple(features)
|
||||||
return (field, offset)
|
self.fields = tuple(fields)
|
||||||
|
self.id2feat = {get_string_id(name): name for name in features}
|
||||||
|
self.feat2field = {feature: fields[feature.split('_', 1)[0]] for feature in features}
|
||||||
|
self.field2feats = {}
|
||||||
|
self.col2info = []
|
||||||
|
self.attr2field = dict(LOWER_FIELDS.items())
|
||||||
|
for feature in features:
|
||||||
|
field = self.feat2field[feature]
|
||||||
|
if field not in self.field2feats:
|
||||||
|
self.col2info.append((field, 0, 'NIL'))
|
||||||
|
self.field2feats.setdefault(field, []).append(feature)
|
||||||
|
self.col2info.append((field, len(self.field2feats[field]), feature))
|
||||||
|
|
||||||
|
@property
|
||||||
|
def field_sizes(self):
|
||||||
|
return [len(self.field2feats[field]) for field in self.fields]
|
||||||
|
|
||||||
cdef int attribute_to_field(unicode attribute_name):
|
def get_field_offset(self, field):
|
||||||
return LOWER_FIELDS[attribute_name]
|
n = 0
|
||||||
|
for f in self.fields:
|
||||||
|
if f == field:
|
||||||
def get_field_id(feature):
|
return n
|
||||||
return FEATURE_FIELDS[feature]
|
n += len(self.field2feats[f])
|
||||||
|
else:
|
||||||
|
return -1
|
||||||
def get_field_size(field):
|
|
||||||
return FIELD_SIZES[FIELDS[field]]
|
|
||||||
|
|
||||||
|
|
||||||
def get_field_offset(field):
|
|
||||||
return FIELD_OFFSETS[FIELDS[field]]
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
|
@ -139,9 +147,11 @@ cdef class Morphology:
|
||||||
self.lemmatizer = lemmatizer
|
self.lemmatizer = lemmatizer
|
||||||
self.n_tags = len(tag_map)
|
self.n_tags = len(tag_map)
|
||||||
self.reverse_index = {}
|
self.reverse_index = {}
|
||||||
|
self._feat_map = MorphologyClassMap(FEATURES, FIELDS)
|
||||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||||
attrs = _normalize_props(attrs)
|
attrs = _normalize_props(attrs)
|
||||||
self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
|
self.add({self._feat_map.id2feat[feat] for feat in attrs
|
||||||
|
if feat in self._feat_map.id2feat})
|
||||||
self.tag_map[tag_str] = dict(attrs)
|
self.tag_map[tag_str] = dict(attrs)
|
||||||
self.reverse_index[self.strings.add(tag_str)] = i
|
self.reverse_index[self.strings.add(tag_str)] = i
|
||||||
|
|
||||||
|
@ -167,7 +177,7 @@ cdef class Morphology:
|
||||||
features = intify_features(features)
|
features = intify_features(features)
|
||||||
cdef attr_t feature
|
cdef attr_t feature
|
||||||
for feature in features:
|
for feature in features:
|
||||||
if feature != 0 and feature not in FEATURE_NAMES:
|
if feature != 0 and feature not in self._feat_map.id2feat:
|
||||||
raise KeyError("Unknown feature: %s" % self.strings[feature])
|
raise KeyError("Unknown feature: %s" % self.strings[feature])
|
||||||
cdef MorphAnalysisC tag
|
cdef MorphAnalysisC tag
|
||||||
tag = create_rich_tag(features)
|
tag = create_rich_tag(features)
|
||||||
|
@ -187,7 +197,7 @@ cdef class Morphology:
|
||||||
features = intify_features(features)
|
features = intify_features(features)
|
||||||
cdef attr_t feature
|
cdef attr_t feature
|
||||||
for feature in features:
|
for feature in features:
|
||||||
field = get_field_id(feature)
|
field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
|
||||||
set_feature(&tag, field, feature, 1)
|
set_feature(&tag, field, feature, 1)
|
||||||
morph = self.insert(tag)
|
morph = self.insert(tag)
|
||||||
return morph
|
return morph
|
||||||
|
@ -224,7 +234,8 @@ cdef class Morphology:
|
||||||
"""
|
"""
|
||||||
attrs = dict(attrs)
|
attrs = dict(attrs)
|
||||||
attrs = _normalize_props(attrs)
|
attrs = _normalize_props(attrs)
|
||||||
self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
|
self.add({self._feat_map.id2feat[feat] for feat in attrs
|
||||||
|
if feat in self._feat_map.id2feat})
|
||||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||||
self.exc[(tag_str, self.strings.add(orth_str))] = attrs
|
self.exc[(tag_str, self.strings.add(orth_str))] = attrs
|
||||||
|
|
||||||
|
@ -313,6 +324,10 @@ cdef class Morphology:
|
||||||
def from_disk(self, path):
|
def from_disk(self, path):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def create_class_map(cls):
|
||||||
|
return MorphologyClassMap(FEATURES, FIELDS)
|
||||||
|
|
||||||
|
|
||||||
cpdef univ_pos_t get_int_tag(pos_):
|
cpdef univ_pos_t get_int_tag(pos_):
|
||||||
return <univ_pos_t>0
|
return <univ_pos_t>0
|
||||||
|
@ -324,17 +339,12 @@ cdef hash_t hash_tag(MorphAnalysisC tag) nogil:
|
||||||
return mrmr.hash64(&tag, sizeof(tag), 0)
|
return mrmr.hash64(&tag, sizeof(tag), 0)
|
||||||
|
|
||||||
|
|
||||||
def get_feature_field(feature):
|
|
||||||
cdef attr_t key = get_string_id(feature)
|
|
||||||
return FEATURE_FIELDS[feature]
|
|
||||||
|
|
||||||
|
|
||||||
cdef MorphAnalysisC create_rich_tag(features) except *:
|
cdef MorphAnalysisC create_rich_tag(features) except *:
|
||||||
cdef MorphAnalysisC tag
|
cdef MorphAnalysisC tag
|
||||||
cdef attr_t feature
|
cdef attr_t feature
|
||||||
memset(&tag, 0, sizeof(tag))
|
memset(&tag, 0, sizeof(tag))
|
||||||
for feature in features:
|
for feature in features:
|
||||||
field = get_field_id(feature)
|
field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
|
||||||
set_feature(&tag, field, feature, 1)
|
set_feature(&tag, field, feature, 1)
|
||||||
return tag
|
return tag
|
||||||
|
|
||||||
|
@ -519,8 +529,7 @@ cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil:
|
||||||
elif field == Field_VerbType:
|
elif field == Field_VerbType:
|
||||||
return tag.verb_type
|
return tag.verb_type
|
||||||
else:
|
else:
|
||||||
raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature))
|
raise ValueError("Unknown field: (%d)" % field_id)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil:
|
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil:
|
||||||
|
@ -1090,22 +1099,5 @@ FEATURES = [
|
||||||
"Voice_int",
|
"Voice_int",
|
||||||
]
|
]
|
||||||
|
|
||||||
FEATURE_NAMES = {get_string_id(name): name for name in FEATURES}
|
FEATURE_NAMES = {get_string_id(f): f for f in FEATURES}
|
||||||
FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES}
|
FEATURE_FIELDS = {f: FIELDS[f.split('_', 1)[0]] for f in FEATURES}
|
||||||
FIELD_SIZES = Counter(FEATURE_FIELDS.values())
|
|
||||||
for field in FIELD_SIZES:
|
|
||||||
FIELD_SIZES[field] += 1
|
|
||||||
for feat_id, name in FEATURE_NAMES.items():
|
|
||||||
FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name]
|
|
||||||
# Mapping of feature names to their position in total vector
|
|
||||||
FEATURE_OFFSETS = {}
|
|
||||||
# Mapping of field names to their first position in total vector.
|
|
||||||
FIELD_OFFSETS = {}
|
|
||||||
_seen_fields = Counter()
|
|
||||||
for i, feature in enumerate(FEATURES):
|
|
||||||
field = FEATURE_FIELDS[feature]
|
|
||||||
# Add 1 for the NIL class, on each field
|
|
||||||
FEATURE_OFFSETS[feature] = _seen_fields[field] + 1
|
|
||||||
if _seen_fields[field] == 0:
|
|
||||||
FIELD_OFFSETS[field] = i
|
|
||||||
_seen_fields[field] += 1
|
|
||||||
|
|
|
@ -16,26 +16,24 @@ from ..compat import basestring_
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from ..morphology cimport Morphology
|
from ..morphology cimport Morphology
|
||||||
from ..morphology import get_field_size, get_field_offset, parse_feature, FIELDS
|
|
||||||
from ..morphology import FEATURES
|
|
||||||
|
|
||||||
|
|
||||||
class Morphologizer(Pipe):
|
class Morphologizer(Pipe):
|
||||||
name = 'morphologizer'
|
name = 'morphologizer'
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, attr_nums=None, **cfg):
|
def Model(cls, **cfg):
|
||||||
if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
|
if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
|
||||||
raise ValueError(TempErrors.T008)
|
raise ValueError(TempErrors.T008)
|
||||||
if attr_nums is None:
|
class_map = Morphology.create_class_map()
|
||||||
attr_nums = [get_field_size(name) for name in FIELDS]
|
return build_morphologizer_model(class_map.field_sizes, **cfg)
|
||||||
return build_morphologizer_model(attr_nums, **cfg)
|
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.cfg = OrderedDict(sorted(cfg.items()))
|
self.cfg = OrderedDict(sorted(cfg.items()))
|
||||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||||
|
self._class_map = self.vocab.morphology.create_class_map()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -76,13 +74,13 @@ class Morphologizer(Pipe):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
field_names = list(FIELDS)
|
offsets = [self._class_map.get_field_offset(field)
|
||||||
offsets = [get_field_offset(field) for field in field_names]
|
for field in self._class_map.fields]
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_scores = batch_scores[i]
|
doc_scores = batch_scores[i]
|
||||||
doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)
|
doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)
|
||||||
# Convert the neuron indices into feature IDs.
|
# Convert the neuron indices into feature IDs.
|
||||||
doc_feat_ids = numpy.zeros((len(doc), len(field_names)), dtype='i')
|
doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i')
|
||||||
for j in range(len(doc)):
|
for j in range(len(doc)):
|
||||||
for k, offset in enumerate(offsets):
|
for k, offset in enumerate(offsets):
|
||||||
if doc_guesses[j, k] == 0:
|
if doc_guesses[j, k] == 0:
|
||||||
|
@ -90,7 +88,8 @@ class Morphologizer(Pipe):
|
||||||
else:
|
else:
|
||||||
doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1)
|
doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1)
|
||||||
# Get the set of feature names.
|
# Get the set of feature names.
|
||||||
feats = {FEATURES[f] for f in doc_feat_ids[j] if f != 0}
|
feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]
|
||||||
|
if f != 0}
|
||||||
# Now add the analysis, and set the hash.
|
# Now add the analysis, and set the hash.
|
||||||
try:
|
try:
|
||||||
doc.c[j].morph = self.vocab.morphology.add(feats)
|
doc.c[j].morph = self.vocab.morphology.add(feats)
|
||||||
|
@ -132,14 +131,15 @@ class Morphologizer(Pipe):
|
||||||
if features is None:
|
if features is None:
|
||||||
target[idx] = scores[idx]
|
target[idx] = scores[idx]
|
||||||
else:
|
else:
|
||||||
by_field = {}
|
gold_fields = {}
|
||||||
for feature in features:
|
for feature in features:
|
||||||
field, column = parse_feature(feature)
|
field = self.get_field(feature)
|
||||||
by_field[field] = column
|
column = self.get_column(feature)
|
||||||
|
gold_fields[field] = column
|
||||||
col_offset = 0
|
col_offset = 0
|
||||||
for field, field_size in enumerate(field_sizes):
|
for field, field_size in enumerate(field_sizes):
|
||||||
if field in by_field:
|
if field in gold_fields:
|
||||||
target[idx, col_offset + by_field[field]] = 1.
|
target[idx, col_offset + gold_fields[field]] = 1.
|
||||||
else:
|
else:
|
||||||
target[idx, col_offset] = 1.
|
target[idx, col_offset] = 1.
|
||||||
col_offset += field_size
|
col_offset += field_size
|
||||||
|
|
|
@ -3,7 +3,6 @@ from libc.string cimport memset
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from ..typedefs cimport hash_t, attr_t
|
from ..typedefs cimport hash_t, attr_t
|
||||||
from ..morphology cimport list_features, check_feature, get_field, tag_to_json
|
from ..morphology cimport list_features, check_feature, get_field, tag_to_json
|
||||||
from ..morphology cimport attribute_to_field
|
|
||||||
|
|
||||||
from ..strings import get_string_id
|
from ..strings import get_string_id
|
||||||
|
|
||||||
|
@ -53,7 +52,7 @@ cdef class MorphAnalysis:
|
||||||
return self.key
|
return self.key
|
||||||
|
|
||||||
def get(self, unicode field):
|
def get(self, unicode field):
|
||||||
cdef int field_id = attribute_to_field(field)
|
cdef int field_id = self.vocab.morphology._feat_map.attr2field[field]
|
||||||
return self.vocab.strings[get_field(&self.c, field_id)]
|
return self.vocab.strings[get_field(&self.c, field_id)]
|
||||||
|
|
||||||
def to_json(self):
|
def to_json(self):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user