mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Refactor morphologizer class map
This commit is contained in:
parent
f742900f83
commit
41a3016019
|
@ -20,6 +20,7 @@ cdef class Morphology:
|
|||
cdef readonly object tag_names
|
||||
cdef readonly object reverse_index
|
||||
cdef readonly object exc
|
||||
cdef readonly object _feat_map
|
||||
cdef readonly PreshMapArray _cache
|
||||
cdef readonly int n_tags
|
||||
|
||||
|
@ -36,6 +37,5 @@ cdef class Morphology:
|
|||
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil
|
||||
cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil
|
||||
cdef list list_features(const MorphAnalysisC* tag)
|
||||
cdef int attribute_to_field(unicode attribute)
|
||||
|
||||
cdef tag_to_json(const MorphAnalysisC* tag)
|
||||
|
|
|
@ -93,26 +93,34 @@ def _normalize_props(props):
|
|||
return out
|
||||
|
||||
|
||||
def parse_feature(feature):
    """Look up the field and the offset registered for a feature.

    feature: A key present in both the module-level FEATURE_FIELDS and
        FEATURE_OFFSETS tables.
    Returns a `(field, offset)` tuple holding the two table entries.
    Raises KeyError if `feature` is missing from either table.
    """
    # Tuple elements are evaluated left to right, so FEATURE_FIELDS is
    # consulted before FEATURE_OFFSETS.
    return (FEATURE_FIELDS[feature], FEATURE_OFFSETS[feature])
|
||||
class MorphologyClassMap(object):
    """Lookup tables relating morphological features to their fields.

    Attributes built by __init__:
        features: Tuple of the feature names, in the order given.
        fields: Tuple of the keys of `fields` (the field names).
        id2feat: Maps get_string_id(feature name) to the feature name.
        feat2field: Maps a feature name to the value `fields` stores for
            the feature's prefix (the text before its first '_').
        field2feats: Maps such a field value to the list of its feature
            names, in the order they were encountered.
        col2info: List of `(field value, position, feature name)` rows. A
            `(field, 0, 'NIL')` row is appended the first time a field is
            seen; each feature then adds a row whose position is its
            1-based rank within the field.
        attr2field: A copy of the module-level LOWER_FIELDS mapping.
    """

    def __init__(self, features, fields):
        """Build all lookup tables.

        features: Iterable of feature names. It is materialised once, so
            a one-shot iterable is fine.
        fields: Mapping from field name to the value identifying the
            field. It is iterated for its keys and indexed by the prefix
            of every feature name.
        Raises KeyError if a feature's prefix is not a key of `fields`.
        """
        self.features = tuple(features)
        self.fields = tuple(fields)
        # Iterate the stored tuple rather than `features` again, so that
        # an iterator argument is not silently exhausted.
        self.id2feat = {get_string_id(name): name for name in self.features}
        self.feat2field = {feature: fields[feature.split('_', 1)[0]]
                           for feature in self.features}
        # Field name -> key used by field2feats. It is derived from
        # feat2field so that it holds exactly the keys field2feats will
        # have; names that own no feature are absent.
        self._field_keys = {feature.split('_', 1)[0]: key
                            for feature, key in self.feat2field.items()}
        self.field2feats = {}
        self.col2info = []
        self.attr2field = dict(LOWER_FIELDS.items())
        for feature in self.features:
            field = self.feat2field[feature]
            if field not in self.field2feats:
                # First feature of this field: open it with its NIL row.
                self.col2info.append((field, 0, 'NIL'))
                self.field2feats[field] = []
            members = self.field2feats[field]
            members.append(feature)
            self.col2info.append((field, len(members), feature))

    @property
    def field_sizes(self):
        """Number of features per field, in the order of self.fields.

        self.fields holds field names while field2feats is keyed by the
        values `fields` assigns to them, so each name is translated
        before the lookup. A field that owns no feature has size 0.

        NOTE(review): the sizes count features only. Whether consumers
        also expect a slot for the 'NIL' row of col2info is not visible
        from this class -- confirm against the callers.
        """
        sizes = []
        for name in self.fields:
            if name in self._field_keys:
                sizes.append(len(self.field2feats[self._field_keys[name]]))
            else:
                sizes.append(0)
        return sizes
|
||||
|
||||
cdef int attribute_to_field(unicode attribute_name):
    """Return the field id that LOWER_FIELDS stores for `attribute_name`."""
    cdef int field_id = LOWER_FIELDS[attribute_name]
    return field_id
|
||||
|
||||
|
||||
def get_field_id(feature):
    """Return the FEATURE_FIELDS entry for `feature`.

    Raises KeyError if `feature` is not a key of FEATURE_FIELDS.
    """
    field_id = FEATURE_FIELDS[feature]
    return field_id
|
||||
|
||||
|
||||
def get_field_size(field):
    """Return the FIELD_SIZES entry of the field named `field`.

    The name is first resolved through FIELDS (KeyError if unknown), and
    the resulting value is then used as the key into FIELD_SIZES.
    """
    field_id = FIELDS[field]
    return FIELD_SIZES[field_id]
|
||||
|
||||
|
||||
def get_field_offset(field):
    """Return the FIELD_OFFSETS entry of the field named `field`.

    The name is first resolved through FIELDS, and the resulting value is
    then used as the key into FIELD_OFFSETS. Raises KeyError if either
    lookup fails.
    """
    field_id = FIELDS[field]
    return FIELD_OFFSETS[field_id]
|
||||
def get_field_offset(self, field):
    """Return the summed feature counts of the fields preceding `field`.

    field: One of the names in self.fields.
    Returns 0 for the first field, the running total of earlier fields'
    feature counts otherwise, and -1 if `field` is not in self.fields.

    self.fields holds field names, whereas self.field2feats is keyed by
    the values stored in self.feat2field. Each name is therefore
    translated through the prefix (text before the first '_') of the
    features that belong to it before the lookup. A name that owns no
    feature contributes 0 instead of raising KeyError.
    """
    # Field name -> key of self.field2feats, rebuilt from feat2field so
    # this method does not depend on any other attribute.
    field_keys = {feature.split('_', 1)[0]: key
                  for feature, key in self.feat2field.items()}
    offset = 0
    for name in self.fields:
        if name == field:
            return offset
        if name in field_keys:
            offset += len(self.field2feats[field_keys[name]])
    return -1
|
||||
|
||||
|
||||
cdef class Morphology:
|
||||
|
@ -139,9 +147,11 @@ cdef class Morphology:
|
|||
self.lemmatizer = lemmatizer
|
||||
self.n_tags = len(tag_map)
|
||||
self.reverse_index = {}
|
||||
self._feat_map = MorphologyClassMap(FEATURES, FIELDS)
|
||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||
attrs = _normalize_props(attrs)
|
||||
self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
|
||||
self.add({self._feat_map.id2feat[feat] for feat in attrs
|
||||
if feat in self._feat_map.id2feat})
|
||||
self.tag_map[tag_str] = dict(attrs)
|
||||
self.reverse_index[self.strings.add(tag_str)] = i
|
||||
|
||||
|
@ -167,7 +177,7 @@ cdef class Morphology:
|
|||
features = intify_features(features)
|
||||
cdef attr_t feature
|
||||
for feature in features:
|
||||
if feature != 0 and feature not in FEATURE_NAMES:
|
||||
if feature != 0 and feature not in self._feat_map.id2feat:
|
||||
raise KeyError("Unknown feature: %s" % self.strings[feature])
|
||||
cdef MorphAnalysisC tag
|
||||
tag = create_rich_tag(features)
|
||||
|
@ -187,7 +197,7 @@ cdef class Morphology:
|
|||
features = intify_features(features)
|
||||
cdef attr_t feature
|
||||
for feature in features:
|
||||
field = get_field_id(feature)
|
||||
field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
|
||||
set_feature(&tag, field, feature, 1)
|
||||
morph = self.insert(tag)
|
||||
return morph
|
||||
|
@ -224,7 +234,8 @@ cdef class Morphology:
|
|||
"""
|
||||
attrs = dict(attrs)
|
||||
attrs = _normalize_props(attrs)
|
||||
self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
|
||||
self.add({self._feat_map.id2feat[feat] for feat in attrs
|
||||
if feat in self._feat_map.id2feat})
|
||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||
self.exc[(tag_str, self.strings.add(orth_str))] = attrs
|
||||
|
||||
|
@ -313,6 +324,10 @@ cdef class Morphology:
|
|||
def from_disk(self, path):
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
def create_class_map(cls):
    """Build a new MorphologyClassMap from the module-level FEATURES and FIELDS."""
    class_map = MorphologyClassMap(FEATURES, FIELDS)
    return class_map
|
||||
|
||||
|
||||
cpdef univ_pos_t get_int_tag(pos_):
|
||||
return <univ_pos_t>0
|
||||
|
@ -324,17 +339,12 @@ cdef hash_t hash_tag(MorphAnalysisC tag) nogil:
|
|||
return mrmr.hash64(&tag, sizeof(tag), 0)
|
||||
|
||||
|
||||
def get_feature_field(feature):
    """Return the FEATURE_FIELDS entry for `feature`.

    Raises KeyError if `feature` is not a key of FEATURE_FIELDS.
    """
    cdef attr_t key
    # The string id is computed but never consulted: the lookup below is
    # keyed by `feature` itself. The call is kept so that any error it
    # raises for an unsuitable argument still surfaces.
    key = get_string_id(feature)
    return FEATURE_FIELDS[feature]
|
||||
|
||||
|
||||
cdef MorphAnalysisC create_rich_tag(features) except *:
|
||||
cdef MorphAnalysisC tag
|
||||
cdef attr_t feature
|
||||
memset(&tag, 0, sizeof(tag))
|
||||
for feature in features:
|
||||
field = get_field_id(feature)
|
||||
field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
|
||||
set_feature(&tag, field, feature, 1)
|
||||
return tag
|
||||
|
||||
|
@ -519,8 +529,7 @@ cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil:
|
|||
elif field == Field_VerbType:
|
||||
return tag.verb_type
|
||||
else:
|
||||
raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature))
|
||||
|
||||
raise ValueError("Unknown field: (%d)" % field_id)
|
||||
|
||||
|
||||
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil:
|
||||
|
@ -1090,22 +1099,5 @@ FEATURES = [
|
|||
"Voice_int",
|
||||
]
|
||||
|
||||
FEATURE_NAMES = {get_string_id(name): name for name in FEATURES}
|
||||
FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES}
|
||||
FIELD_SIZES = Counter(FEATURE_FIELDS.values())
|
||||
for field in FIELD_SIZES:
|
||||
FIELD_SIZES[field] += 1
|
||||
for feat_id, name in FEATURE_NAMES.items():
|
||||
FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name]
|
||||
# Mapping of feature names to their position in total vector
|
||||
FEATURE_OFFSETS = {}
|
||||
# Mapping of field names to their first position in total vector.
|
||||
FIELD_OFFSETS = {}
|
||||
_seen_fields = Counter()
|
||||
for i, feature in enumerate(FEATURES):
|
||||
field = FEATURE_FIELDS[feature]
|
||||
# Add 1 for the NIL class, on each field
|
||||
FEATURE_OFFSETS[feature] = _seen_fields[field] + 1
|
||||
if _seen_fields[field] == 0:
|
||||
FIELD_OFFSETS[field] = i
|
||||
_seen_fields[field] += 1
|
||||
FEATURE_NAMES = {get_string_id(f): f for f in FEATURES}
|
||||
FEATURE_FIELDS = {f: FIELDS[f.split('_', 1)[0]] for f in FEATURES}
|
||||
|
|
|
@ -16,26 +16,24 @@ from ..compat import basestring_
|
|||
from ..tokens.doc cimport Doc
|
||||
from ..vocab cimport Vocab
|
||||
from ..morphology cimport Morphology
|
||||
from ..morphology import get_field_size, get_field_offset, parse_feature, FIELDS
|
||||
from ..morphology import FEATURES
|
||||
|
||||
|
||||
class Morphologizer(Pipe):
|
||||
name = 'morphologizer'
|
||||
|
||||
@classmethod
|
||||
def Model(cls, attr_nums=None, **cfg):
|
||||
def Model(cls, **cfg):
|
||||
if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
|
||||
raise ValueError(TempErrors.T008)
|
||||
if attr_nums is None:
|
||||
attr_nums = [get_field_size(name) for name in FIELDS]
|
||||
return build_morphologizer_model(attr_nums, **cfg)
|
||||
class_map = Morphology.create_class_map()
|
||||
return build_morphologizer_model(class_map.field_sizes, **cfg)
|
||||
|
||||
def __init__(self, vocab, model=True, **cfg):
    """Store the vocab, the model and the configuration of the component.

    vocab: Object whose `morphology.create_class_map()` supplies the
        class map kept in `self._class_map`.
    model: Stored unchanged on `self.model`; defaults to True.
    **cfg: Settings, kept on `self.cfg` as an OrderedDict sorted by key.
        'cnn_maxout_pieces' is set to 2 unless given explicitly.
    """
    self.vocab = vocab
    self.model = model
    settings = OrderedDict(sorted(cfg.items()))
    # Only fills in the value when the caller did not supply one.
    settings.setdefault('cnn_maxout_pieces', 2)
    self.cfg = settings
    self._class_map = self.vocab.morphology.create_class_map()
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -76,13 +74,13 @@ class Morphologizer(Pipe):
|
|||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef Vocab vocab = self.vocab
|
||||
field_names = list(FIELDS)
|
||||
offsets = [get_field_offset(field) for field in field_names]
|
||||
offsets = [self._class_map.get_field_offset(field)
|
||||
for field in self._class_map.fields]
|
||||
for i, doc in enumerate(docs):
|
||||
doc_scores = batch_scores[i]
|
||||
doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)
|
||||
# Convert the neuron indices into feature IDs.
|
||||
doc_feat_ids = numpy.zeros((len(doc), len(field_names)), dtype='i')
|
||||
doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i')
|
||||
for j in range(len(doc)):
|
||||
for k, offset in enumerate(offsets):
|
||||
if doc_guesses[j, k] == 0:
|
||||
|
@ -90,7 +88,8 @@ class Morphologizer(Pipe):
|
|||
else:
|
||||
doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1)
|
||||
# Get the set of feature names.
|
||||
feats = {FEATURES[f] for f in doc_feat_ids[j] if f != 0}
|
||||
feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]
|
||||
if f != 0}
|
||||
# Now add the analysis, and set the hash.
|
||||
try:
|
||||
doc.c[j].morph = self.vocab.morphology.add(feats)
|
||||
|
@ -132,14 +131,15 @@ class Morphologizer(Pipe):
|
|||
if features is None:
|
||||
target[idx] = scores[idx]
|
||||
else:
|
||||
by_field = {}
|
||||
gold_fields = {}
|
||||
for feature in features:
|
||||
field, column = parse_feature(feature)
|
||||
by_field[field] = column
|
||||
field = self.get_field(feature)
|
||||
column = self.get_column(feature)
|
||||
gold_fields[field] = column
|
||||
col_offset = 0
|
||||
for field, field_size in enumerate(field_sizes):
|
||||
if field in by_field:
|
||||
target[idx, col_offset + by_field[field]] = 1.
|
||||
if field in gold_fields:
|
||||
target[idx, col_offset + gold_fields[field]] = 1.
|
||||
else:
|
||||
target[idx, col_offset] = 1.
|
||||
col_offset += field_size
|
||||
|
|
|
@ -3,7 +3,6 @@ from libc.string cimport memset
|
|||
from ..vocab cimport Vocab
|
||||
from ..typedefs cimport hash_t, attr_t
|
||||
from ..morphology cimport list_features, check_feature, get_field, tag_to_json
|
||||
from ..morphology cimport attribute_to_field
|
||||
|
||||
from ..strings import get_string_id
|
||||
|
||||
|
@ -53,7 +52,7 @@ cdef class MorphAnalysis:
|
|||
return self.key
|
||||
|
||||
def get(self, unicode field):
|
||||
cdef int field_id = attribute_to_field(field)
|
||||
cdef int field_id = self.vocab.morphology._feat_map.attr2field[field]
|
||||
return self.vocab.strings[get_field(&self.c, field_id)]
|
||||
|
||||
def to_json(self):
|
||||
|
|
Loading…
Reference in New Issue
Block a user