diff --git a/spacy/_ml.py b/spacy/_ml.py index 85d80c3f1..2e4df843c 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -561,7 +561,7 @@ def build_morphologizer_model(class_nums, **cfg): token_vector_width = util.env_opt("token_vector_width", 128) pretrained_vectors = cfg.get("pretrained_vectors") char_embed = cfg.get("char_embed", True) - with Model.define_operators({">>": chain, "+": add}): + with Model.define_operators({">>": chain, "+": add, "**": clone}): if "tok2vec" in cfg: tok2vec = cfg["tok2vec"] else: @@ -571,7 +571,9 @@ def build_morphologizer_model(class_nums, **cfg): char_embed=char_embed, pretrained_vectors=pretrained_vectors, ) - softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width)) + softmax = with_flatten( + MultiSoftmax(class_nums, token_vector_width) + ) softmax.out_sizes = class_nums model = tok2vec >> softmax model.nI = None diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index fdaa44813..6c9ecebdc 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -46,8 +46,8 @@ cdef enum univ_field_t: Field_PartForm Field_PartType Field_Person - Field_Polite Field_Polarity + Field_Polite Field_Poss Field_Prefix Field_PrepCase @@ -60,8 +60,8 @@ cdef enum univ_field_t: Field_Tense Field_Typo Field_VerbForm - Field_Voice Field_VerbType + Field_Voice def _normalize_props(props): @@ -94,20 +94,36 @@ def _normalize_props(props): class MorphologyClassMap(object): - def __init__(self, features, fields): + def __init__(self, features): self.features = tuple(features) - self.fields = tuple(fields) + self.fields = [] + self.feat2field = {} + seen_fields = set() + for feature in features: + field = feature.split("_", 1)[0] + if field not in seen_fields: + self.fields.append(field) + seen_fields.add(field) + self.feat2field[feature] = FIELDS[field] self.id2feat = {get_string_id(name): name for name in features} - self.feat2field = {feature: fields[feature.split('_', 1)[0]] for feature in features} - self.field2feats = {} + self.field2feats = {"POS": []} self.col2info = [] self.attr2field = dict(LOWER_FIELDS.items()) + self.feat2offset = {} + self.field2col = {} + self.field2id = dict(FIELDS.items()) + self.fieldid2field = {field_id: field for field, field_id in FIELDS.items()} for feature in features: - field = self.feat2field[feature] - if field not in self.field2feats: - self.col2info.append((field, 0, 'NIL')) - self.field2feats.setdefault(field, []).append(feature) - self.col2info.append((field, len(self.field2feats[field]), feature)) + field = self.fields[self.feat2field[feature]] + if field not in self.field2col: + self.field2col[field] = len(self.col2info) + if field != "POS" and field not in self.field2feats: + self.col2info.append((field, 0, "NIL")) + self.field2feats.setdefault(field, ["NIL"]) + offset = len(self.field2feats[field]) + self.field2feats[field].append(feature) + self.col2info.append((field, offset, feature)) + self.feat2offset[feature] = offset @property def field_sizes(self): @@ -147,7 +163,7 @@ cdef class Morphology: self.lemmatizer = lemmatizer self.n_tags = len(tag_map) self.reverse_index = {} - self._feat_map = MorphologyClassMap(FEATURES, FIELDS) + self._feat_map = MorphologyClassMap(FEATURES) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): attrs = _normalize_props(attrs) self.add({self._feat_map.id2feat[feat] for feat in attrs @@ -326,7 +342,7 @@ cdef class Morphology: @classmethod def create_class_map(cls): - return MorphologyClassMap(FEATURES, FIELDS) + return MorphologyClassMap(FEATURES) cpdef univ_pos_t get_int_tag(pos_): @@ -770,8 +786,8 @@ FIELDS = { 'Tense': Field_Tense, 'Typo': Field_Typo, 'VerbForm': Field_VerbForm, + 'VerbType': Field_VerbType, 'Voice': Field_Voice, - 'VerbType': Field_VerbType } LOWER_FIELDS = { @@ -803,8 +819,8 @@ LOWER_FIELDS = { 'part_form': Field_PartForm, 'part_type': Field_PartType, 'person': Field_Person, - 'polite': Field_Polite, 'polarity': Field_Polarity, + 'polite': Field_Polite, 'poss': Field_Poss, 'prefix': Field_Prefix, 'prep_case': Field_PrepCase, @@ -817,8 +833,8 @@ LOWER_FIELDS = { 'tense': Field_Tense, 'typo': Field_Typo, 'verb_form': Field_VerbForm, + 'verb_type': Field_VerbType, 'voice': Field_Voice, - 'verb_type': Field_VerbType } @@ -849,7 +865,7 @@ FEATURES = [ "AdpType_prep", "AdpType_post", "AdpType_voc", - "AdvType_adadj," + "AdvType_adadj", "AdvType_cau", "AdvType_deg", "AdvType_ex", diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index d3d850da0..b14e2bec7 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -86,20 +86,15 @@ class Morphologizer(Pipe): if doc_guesses[j, k] == 0: doc_feat_ids[j, k] = 0 else: - doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1) + doc_feat_ids[j, k] = offset + doc_guesses[j, k] # Get the set of feature names. - feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j] - if f != 0} + feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]} + if "NIL" in feats: + feats.remove("NIL") # Now add the analysis, and set the hash. - try: - doc.c[j].morph = self.vocab.morphology.add(feats) - if doc[j].morph.pos != 0: - doc.c[j].pos = doc[j].morph.pos - except: - print(offsets) - print(doc_guesses[j]) - print(doc_feat_ids[j]) - raise + doc.c[j].morph = self.vocab.morphology.add(feats) + if doc[j].morph.pos != 0: + doc.c[j].pos = doc[j].morph.pos def update(self, docs, golds, drop=0., sgd=None, losses=None): if losses is not None and self.name not in losses: @@ -126,23 +121,25 @@ class Morphologizer(Pipe): # Do this on CPU, as we can't vectorize easily. target = numpy.zeros(scores.shape, dtype='f') field_sizes = self.model.softmax.out_sizes - for gold in golds: - for features in gold.morphology: + for doc, gold in zip(docs, golds): + for t, features in enumerate(gold.morphology): if features is None: target[idx] = scores[idx] else: gold_fields = {} for feature in features: - field = self.get_field(feature) - column = self.get_column(feature) - gold_fields[field] = column - col_offset = 0 - for field, field_size in enumerate(field_sizes): - if field in gold_fields: - target[idx, col_offset + gold_fields[field]] = 1. + field = self._class_map.feat2field[feature] + gold_fields[field] = self._class_map.feat2offset[feature] + for field in self._class_map.fields: + field_id = self._class_map.field2id[field] + col_offset = self._class_map.field2col[field] + if field_id in gold_fields: + target[idx, col_offset + gold_fields[field_id]] = 1. else: target[idx, col_offset] = 1. - col_offset += field_size + #print(doc[t]) + #for col, info in enumerate(self._class_map.col2info): + # print(col, info, scores[idx, col], target[idx, col]) idx += 1 target = self.model.ops.asarray(target, dtype='f') scores = self.model.ops.asarray(scores, dtype='f') diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index fa90603bc..450497f3b 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -418,6 +418,8 @@ class Tagger(Pipe): vocab.morphology.assign_tag_id(&doc.c[j], tag_id) if lemma != 0 and lemma != doc.c[j].lex.orth: doc.c[j].lemma = lemma + else: + doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] idx += 1 if tensors is not None and len(tensors): if isinstance(doc.tensor, numpy.ndarray) \