mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-03 19:08:06 +03:00
Fix Morphologizer
This commit is contained in:
parent
3908911da4
commit
d7ec1d62cb
|
@ -299,7 +299,7 @@ def get_token_conllu(token, i):
|
||||||
head = 0
|
head = 0
|
||||||
else:
|
else:
|
||||||
head = i + (token.head.i - token.i) + 1
|
head = i + (token.head.i - token.i) + 1
|
||||||
features = token.vocab.morphology.get(token.morph_key)
|
features = list(token.morph)
|
||||||
feat_str = []
|
feat_str = []
|
||||||
replacements = {"one": "1", "two": "2", "three": "3"}
|
replacements = {"one": "1", "two": "2", "three": "3"}
|
||||||
for feat in features:
|
for feat in features:
|
||||||
|
|
|
@ -107,11 +107,11 @@ def get_field_id(feature):
|
||||||
|
|
||||||
|
|
||||||
def get_field_size(field):
|
def get_field_size(field):
|
||||||
return FIELD_SIZES[field]
|
return FIELD_SIZES[FIELDS[field]]
|
||||||
|
|
||||||
|
|
||||||
def get_field_offset(field):
|
def get_field_offset(field):
|
||||||
return FIELD_OFFSETS[field]
|
return FIELD_OFFSETS[FIELDS[field]]
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
|
@ -831,6 +831,8 @@ FEATURES = [
|
||||||
"Aspect_mod",
|
"Aspect_mod",
|
||||||
"Aspect_none",
|
"Aspect_none",
|
||||||
"Aspect_perf",
|
"Aspect_perf",
|
||||||
|
"Aspect_prof",
|
||||||
|
"Aspect_prosp",
|
||||||
"Case_abe",
|
"Case_abe",
|
||||||
"Case_abl",
|
"Case_abl",
|
||||||
"Case_abs",
|
"Case_abs",
|
||||||
|
@ -1074,6 +1076,6 @@ _seen_fields = Counter()
|
||||||
for i, feature in enumerate(FEATURES):
|
for i, feature in enumerate(FEATURES):
|
||||||
field = FEATURE_FIELDS[feature]
|
field = FEATURE_FIELDS[feature]
|
||||||
FEATURE_OFFSETS[feature] = _seen_fields[field]
|
FEATURE_OFFSETS[feature] = _seen_fields[field]
|
||||||
if _seen_fields == 0:
|
if _seen_fields[field] == 0:
|
||||||
FIELD_OFFSETS[field] = i
|
FIELD_OFFSETS[field] = i
|
||||||
_seen_fields[field] += 1
|
_seen_fields[field] += 1
|
||||||
|
|
|
@ -81,16 +81,18 @@ class Morphologizer(Pipe):
|
||||||
doc_scores = batch_scores[i]
|
doc_scores = batch_scores[i]
|
||||||
doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)
|
doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)
|
||||||
# Convert the neuron indices into feature IDs.
|
# Convert the neuron indices into feature IDs.
|
||||||
doc_feat_ids = self.model.ops.allocate((len(doc), len(field_names)), dtype='i')
|
doc_feat_ids = numpy.zeros((len(doc), len(field_names)), dtype='i')
|
||||||
for j in range(len(doc)):
|
for j in range(len(doc)):
|
||||||
for k, offset in enumerate(offsets):
|
for k, offset in enumerate(offsets):
|
||||||
if doc_guesses[j, k] == 0:
|
if doc_guesses[j, k] == 0:
|
||||||
doc_feat_ids[j, k] = 0
|
doc_feat_ids[j, k] = 0
|
||||||
else:
|
else:
|
||||||
doc_feat_ids[j, k] = offset + doc_guesses[j, k]
|
doc_feat_ids[j, k] = offset + doc_guesses[j, k]
|
||||||
|
# Get the set of feature names.
|
||||||
|
feats = {FEATURES[f] for f in doc_feat_ids[j] if f != 0}
|
||||||
# Now add the analysis, and set the hash.
|
# Now add the analysis, and set the hash.
|
||||||
try:
|
try:
|
||||||
doc.c[j].morph = self.vocab.morphology.add(doc_feat_ids[j])
|
doc.c[j].morph = self.vocab.morphology.add(feats)
|
||||||
except:
|
except:
|
||||||
print(offsets)
|
print(offsets)
|
||||||
print(doc_guesses[j])
|
print(doc_guesses[j])
|
||||||
|
@ -114,7 +116,12 @@ class Morphologizer(Pipe):
|
||||||
guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes))
|
guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes))
|
||||||
guesses = self.model.ops.xp.vstack(guesses)
|
guesses = self.model.ops.xp.vstack(guesses)
|
||||||
scores = self.model.ops.xp.vstack(scores)
|
scores = self.model.ops.xp.vstack(scores)
|
||||||
|
if not isinstance(scores, numpy.ndarray):
|
||||||
|
scores = scores.get()
|
||||||
|
if not isinstance(guesses, numpy.ndarray):
|
||||||
|
guesses = guesses.get()
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
|
# Do this on CPU, as we can't vectorize easily.
|
||||||
target = numpy.zeros(scores.shape, dtype='f')
|
target = numpy.zeros(scores.shape, dtype='f')
|
||||||
field_sizes = self.model.softmax.out_sizes
|
field_sizes = self.model.softmax.out_sizes
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
|
@ -134,7 +141,8 @@ class Morphologizer(Pipe):
|
||||||
target[idx, col_offset] = 1.
|
target[idx, col_offset] = 1.
|
||||||
col_offset += field_size
|
col_offset += field_size
|
||||||
idx += 1
|
idx += 1
|
||||||
target = self.model.ops.xp.array(target, dtype='f')
|
target = self.model.ops.asarray(target, dtype='f')
|
||||||
|
scores = self.model.ops.asarray(scores, dtype='f')
|
||||||
d_scores = scores - target
|
d_scores = scores - target
|
||||||
loss = (d_scores**2).sum()
|
loss = (d_scores**2).sum()
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||||
|
|
Loading…
Reference in New Issue
Block a user