mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-03 10:55:52 +03:00
Fix Morphologizer
This commit is contained in:
parent
3908911da4
commit
d7ec1d62cb
|
@ -299,7 +299,7 @@ def get_token_conllu(token, i):
|
|||
head = 0
|
||||
else:
|
||||
head = i + (token.head.i - token.i) + 1
|
||||
features = token.vocab.morphology.get(token.morph_key)
|
||||
features = list(token.morph)
|
||||
feat_str = []
|
||||
replacements = {"one": "1", "two": "2", "three": "3"}
|
||||
for feat in features:
|
||||
|
|
|
@ -107,11 +107,11 @@ def get_field_id(feature):
|
|||
|
||||
|
||||
def get_field_size(field):
|
||||
return FIELD_SIZES[field]
|
||||
return FIELD_SIZES[FIELDS[field]]
|
||||
|
||||
|
||||
def get_field_offset(field):
|
||||
return FIELD_OFFSETS[field]
|
||||
return FIELD_OFFSETS[FIELDS[field]]
|
||||
|
||||
|
||||
cdef class Morphology:
|
||||
|
@ -831,6 +831,8 @@ FEATURES = [
|
|||
"Aspect_mod",
|
||||
"Aspect_none",
|
||||
"Aspect_perf",
|
||||
"Aspect_prof",
|
||||
"Aspect_prosp",
|
||||
"Case_abe",
|
||||
"Case_abl",
|
||||
"Case_abs",
|
||||
|
@ -1074,6 +1076,6 @@ _seen_fields = Counter()
|
|||
for i, feature in enumerate(FEATURES):
|
||||
field = FEATURE_FIELDS[feature]
|
||||
FEATURE_OFFSETS[feature] = _seen_fields[field]
|
||||
if _seen_fields == 0:
|
||||
if _seen_fields[field] == 0:
|
||||
FIELD_OFFSETS[field] = i
|
||||
_seen_fields[field] += 1
|
||||
|
|
|
@ -81,16 +81,18 @@ class Morphologizer(Pipe):
|
|||
doc_scores = batch_scores[i]
|
||||
doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)
|
||||
# Convert the neuron indices into feature IDs.
|
||||
doc_feat_ids = self.model.ops.allocate((len(doc), len(field_names)), dtype='i')
|
||||
doc_feat_ids = numpy.zeros((len(doc), len(field_names)), dtype='i')
|
||||
for j in range(len(doc)):
|
||||
for k, offset in enumerate(offsets):
|
||||
if doc_guesses[j, k] == 0:
|
||||
doc_feat_ids[j, k] = 0
|
||||
else:
|
||||
doc_feat_ids[j, k] = offset + doc_guesses[j, k]
|
||||
# Get the set of feature names.
|
||||
feats = {FEATURES[f] for f in doc_feat_ids[j] if f != 0}
|
||||
# Now add the analysis, and set the hash.
|
||||
try:
|
||||
doc.c[j].morph = self.vocab.morphology.add(doc_feat_ids[j])
|
||||
doc.c[j].morph = self.vocab.morphology.add(feats)
|
||||
except:
|
||||
print(offsets)
|
||||
print(doc_guesses[j])
|
||||
|
@ -114,7 +116,12 @@ class Morphologizer(Pipe):
|
|||
guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes))
|
||||
guesses = self.model.ops.xp.vstack(guesses)
|
||||
scores = self.model.ops.xp.vstack(scores)
|
||||
if not isinstance(scores, numpy.ndarray):
|
||||
scores = scores.get()
|
||||
if not isinstance(guesses, numpy.ndarray):
|
||||
guesses = guesses.get()
|
||||
cdef int idx = 0
|
||||
# Do this on CPU, as we can't vectorize easily.
|
||||
target = numpy.zeros(scores.shape, dtype='f')
|
||||
field_sizes = self.model.softmax.out_sizes
|
||||
for gold in golds:
|
||||
|
@ -134,7 +141,8 @@ class Morphologizer(Pipe):
|
|||
target[idx, col_offset] = 1.
|
||||
col_offset += field_size
|
||||
idx += 1
|
||||
target = self.model.ops.xp.array(target, dtype='f')
|
||||
target = self.model.ops.asarray(target, dtype='f')
|
||||
scores = self.model.ops.asarray(scores, dtype='f')
|
||||
d_scores = scores - target
|
||||
loss = (d_scores**2).sum()
|
||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||
|
|
Loading…
Reference in New Issue
Block a user