diff --git a/spacy/cli/ud/ud_train.py b/spacy/cli/ud/ud_train.py index afef6c073..44ecababe 100644 --- a/spacy/cli/ud/ud_train.py +++ b/spacy/cli/ud/ud_train.py @@ -299,7 +299,7 @@ def get_token_conllu(token, i): head = 0 else: head = i + (token.head.i - token.i) + 1 - features = token.vocab.morphology.get(token.morph_key) + features = list(token.morph) feat_str = [] replacements = {"one": "1", "two": "2", "three": "3"} for feat in features: diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 63d0291ff..2f3e8d1fa 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -107,11 +107,11 @@ def get_field_id(feature): def get_field_size(field): - return FIELD_SIZES[field] + return FIELD_SIZES[FIELDS[field]] def get_field_offset(field): - return FIELD_OFFSETS[field] + return FIELD_OFFSETS[FIELDS[field]] cdef class Morphology: @@ -831,6 +831,8 @@ FEATURES = [ "Aspect_mod", "Aspect_none", "Aspect_perf", + "Aspect_prof", + "Aspect_prosp", "Case_abe", "Case_abl", "Case_abs", @@ -1074,6 +1076,6 @@ _seen_fields = Counter() for i, feature in enumerate(FEATURES): field = FEATURE_FIELDS[feature] FEATURE_OFFSETS[feature] = _seen_fields[field] - if _seen_fields == 0: + if _seen_fields[field] == 0: FIELD_OFFSETS[field] = i _seen_fields[field] += 1 diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 9f25ba357..223bb6ec5 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -81,16 +81,18 @@ class Morphologizer(Pipe): doc_scores = batch_scores[i] doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes) # Convert the neuron indices into feature IDs. 
- doc_feat_ids = self.model.ops.allocate((len(doc), len(field_names)), dtype='i') + doc_feat_ids = numpy.zeros((len(doc), len(field_names)), dtype='i') for j in range(len(doc)): for k, offset in enumerate(offsets): if doc_guesses[j, k] == 0: doc_feat_ids[j, k] = 0 else: doc_feat_ids[j, k] = offset + doc_guesses[j, k] + # Get the set of feature names. + feats = {FEATURES[f] for f in doc_feat_ids[j] if f != 0} # Now add the analysis, and set the hash. try: - doc.c[j].morph = self.vocab.morphology.add(doc_feat_ids[j]) + doc.c[j].morph = self.vocab.morphology.add(feats) except: print(offsets) print(doc_guesses[j]) @@ -114,7 +116,12 @@ class Morphologizer(Pipe): guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes)) guesses = self.model.ops.xp.vstack(guesses) scores = self.model.ops.xp.vstack(scores) + if not isinstance(scores, numpy.ndarray): + scores = scores.get() + if not isinstance(guesses, numpy.ndarray): + guesses = guesses.get() cdef int idx = 0 + # Do this on CPU, as we can't vectorize easily. target = numpy.zeros(scores.shape, dtype='f') field_sizes = self.model.softmax.out_sizes for gold in golds: @@ -134,7 +141,8 @@ class Morphologizer(Pipe): target[idx, col_offset] = 1. col_offset += field_size idx += 1 - target = self.model.ops.xp.array(target, dtype='f') + target = self.model.ops.asarray(target, dtype='f') + scores = self.model.ops.asarray(scores, dtype='f') d_scores = scores - target loss = (d_scores**2).sum() d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])