mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	Refactor morphologizer
This commit is contained in:
		
							parent
							
								
									41a3016019
								
							
						
					
					
						commit
						0f12082465
					
				|  | @ -561,7 +561,7 @@ def build_morphologizer_model(class_nums, **cfg): | |||
|         token_vector_width = util.env_opt("token_vector_width", 128) | ||||
|     pretrained_vectors = cfg.get("pretrained_vectors") | ||||
|     char_embed = cfg.get("char_embed", True) | ||||
|     with Model.define_operators({">>": chain, "+": add}): | ||||
|     with Model.define_operators({">>": chain, "+": add, "**": clone}): | ||||
|         if "tok2vec" in cfg: | ||||
|             tok2vec = cfg["tok2vec"] | ||||
|         else: | ||||
|  | @ -571,7 +571,9 @@ def build_morphologizer_model(class_nums, **cfg): | |||
|                 char_embed=char_embed, | ||||
|                 pretrained_vectors=pretrained_vectors, | ||||
|             ) | ||||
|         softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width)) | ||||
|         softmax = with_flatten( | ||||
|             MultiSoftmax(class_nums, token_vector_width) | ||||
|         ) | ||||
|         softmax.out_sizes = class_nums | ||||
|         model = tok2vec >> softmax | ||||
|     model.nI = None | ||||
|  |  | |||
|  | @ -46,8 +46,8 @@ cdef enum univ_field_t: | |||
|     Field_PartForm | ||||
|     Field_PartType | ||||
|     Field_Person | ||||
|     Field_Polite | ||||
|     Field_Polarity | ||||
|     Field_Polite | ||||
|     Field_Poss | ||||
|     Field_Prefix | ||||
|     Field_PrepCase | ||||
|  | @ -60,8 +60,8 @@ cdef enum univ_field_t: | |||
|     Field_Tense | ||||
|     Field_Typo | ||||
|     Field_VerbForm | ||||
|     Field_Voice | ||||
|     Field_VerbType | ||||
|     Field_Voice | ||||
| 
 | ||||
| 
 | ||||
| def _normalize_props(props): | ||||
|  | @ -94,20 +94,36 @@ def _normalize_props(props): | |||
| 
 | ||||
| 
 | ||||
| class MorphologyClassMap(object): | ||||
|     def __init__(self, features, fields): | ||||
|     def __init__(self, features): | ||||
|         self.features = tuple(features) | ||||
|         self.fields = tuple(fields) | ||||
|         self.fields = [] | ||||
|         self.feat2field = {} | ||||
|         seen_fields = set() | ||||
|         for feature in features: | ||||
|             field = feature.split("_", 1)[0] | ||||
|             if field not in seen_fields: | ||||
|                 self.fields.append(field) | ||||
|                 seen_fields.add(field) | ||||
|             self.feat2field[feature] = FIELDS[field] | ||||
|         self.id2feat = {get_string_id(name): name for name in features} | ||||
|         self.feat2field = {feature: fields[feature.split('_', 1)[0]] for feature in features} | ||||
|         self.field2feats = {} | ||||
|         self.field2feats = {"POS": []} | ||||
|         self.col2info = [] | ||||
|         self.attr2field = dict(LOWER_FIELDS.items()) | ||||
|         self.feat2offset = {} | ||||
|         self.field2col = {} | ||||
|         self.field2id = dict(FIELDS.items()) | ||||
|         self.fieldid2field = {field_id: field for field, field_id in FIELDS.items()} | ||||
|         for feature in features: | ||||
|             field = self.feat2field[feature] | ||||
|             if field not in self.field2feats: | ||||
|                 self.col2info.append((field, 0, 'NIL')) | ||||
|             self.field2feats.setdefault(field, []).append(feature) | ||||
|             self.col2info.append((field, len(self.field2feats[field]), feature)) | ||||
|             field = self.fields[self.feat2field[feature]] | ||||
|             if field not in self.field2col: | ||||
|                 self.field2col[field] = len(self.col2info) | ||||
|             if field != "POS" and field not in self.field2feats: | ||||
|                 self.col2info.append((field, 0, "NIL")) | ||||
|             self.field2feats.setdefault(field, ["NIL"]) | ||||
|             offset = len(self.field2feats[field]) | ||||
|             self.field2feats[field].append(feature) | ||||
|             self.col2info.append((field, offset, feature)) | ||||
|             self.feat2offset[feature] = offset | ||||
| 
 | ||||
|     @property | ||||
|     def field_sizes(self): | ||||
|  | @ -147,7 +163,7 @@ cdef class Morphology: | |||
|         self.lemmatizer = lemmatizer | ||||
|         self.n_tags = len(tag_map) | ||||
|         self.reverse_index = {} | ||||
|         self._feat_map = MorphologyClassMap(FEATURES, FIELDS) | ||||
|         self._feat_map = MorphologyClassMap(FEATURES) | ||||
|         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): | ||||
|             attrs = _normalize_props(attrs) | ||||
|             self.add({self._feat_map.id2feat[feat] for feat in attrs | ||||
|  | @ -326,7 +342,7 @@ cdef class Morphology: | |||
| 
 | ||||
|     @classmethod | ||||
|     def create_class_map(cls): | ||||
|         return MorphologyClassMap(FEATURES, FIELDS) | ||||
|         return MorphologyClassMap(FEATURES) | ||||
| 
 | ||||
| 
 | ||||
| cpdef univ_pos_t get_int_tag(pos_): | ||||
|  | @ -770,8 +786,8 @@ FIELDS = { | |||
|     'Tense': Field_Tense, | ||||
|     'Typo': Field_Typo, | ||||
|     'VerbForm': Field_VerbForm, | ||||
|     'VerbType': Field_VerbType, | ||||
|     'Voice': Field_Voice, | ||||
|     'VerbType': Field_VerbType | ||||
| } | ||||
| 
 | ||||
| LOWER_FIELDS = { | ||||
|  | @ -803,8 +819,8 @@ LOWER_FIELDS = { | |||
|     'part_form': Field_PartForm, | ||||
|     'part_type': Field_PartType, | ||||
|     'person': Field_Person, | ||||
|     'polite': Field_Polite, | ||||
|     'polarity': Field_Polarity, | ||||
|     'polite': Field_Polite, | ||||
|     'poss': Field_Poss, | ||||
|     'prefix': Field_Prefix, | ||||
|     'prep_case': Field_PrepCase, | ||||
|  | @ -817,8 +833,8 @@ LOWER_FIELDS = { | |||
|     'tense': Field_Tense, | ||||
|     'typo': Field_Typo, | ||||
|     'verb_form': Field_VerbForm, | ||||
|     'verb_type': Field_VerbType, | ||||
|     'voice': Field_Voice, | ||||
|     'verb_type': Field_VerbType | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
|  | @ -849,7 +865,7 @@ FEATURES = [ | |||
|    "AdpType_prep", | ||||
|    "AdpType_post", | ||||
|    "AdpType_voc", | ||||
|    "AdvType_adadj," | ||||
|    "AdvType_adadj", | ||||
|    "AdvType_cau", | ||||
|    "AdvType_deg", | ||||
|    "AdvType_ex", | ||||
|  |  | |||
|  | @ -86,20 +86,15 @@ class Morphologizer(Pipe): | |||
|                     if doc_guesses[j, k] == 0: | ||||
|                         doc_feat_ids[j, k] = 0 | ||||
|                     else: | ||||
|                         doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1) | ||||
|                         doc_feat_ids[j, k] = offset + doc_guesses[j, k] | ||||
|                 # Get the set of feature names. | ||||
|                 feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j] | ||||
|                          if f != 0} | ||||
|                 feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]} | ||||
|                 if "NIL" in feats: | ||||
|                     feats.remove("NIL") | ||||
|                 # Now add the analysis, and set the hash. | ||||
|                 try: | ||||
|                     doc.c[j].morph = self.vocab.morphology.add(feats) | ||||
|                     if doc[j].morph.pos != 0: | ||||
|                         doc.c[j].pos = doc[j].morph.pos | ||||
|                 except: | ||||
|                     print(offsets) | ||||
|                     print(doc_guesses[j]) | ||||
|                     print(doc_feat_ids[j]) | ||||
|                     raise | ||||
|                 doc.c[j].morph = self.vocab.morphology.add(feats) | ||||
|                 if doc[j].morph.pos != 0: | ||||
|                     doc.c[j].pos = doc[j].morph.pos | ||||
| 
 | ||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||
|         if losses is not None and self.name not in losses: | ||||
|  | @ -126,23 +121,25 @@ class Morphologizer(Pipe): | |||
|         # Do this on CPU, as we can't vectorize easily. | ||||
|         target = numpy.zeros(scores.shape, dtype='f') | ||||
|         field_sizes = self.model.softmax.out_sizes | ||||
|         for gold in golds: | ||||
|             for features in gold.morphology: | ||||
|         for doc, gold in zip(docs, golds): | ||||
|             for t, features in enumerate(gold.morphology): | ||||
|                 if features is None: | ||||
|                     target[idx] = scores[idx] | ||||
|                 else: | ||||
|                     gold_fields = {} | ||||
|                     for feature in features: | ||||
|                         field = self.get_field(feature) | ||||
|                         column = self.get_column(feature) | ||||
|                         gold_fields[field] = column | ||||
|                     col_offset = 0 | ||||
|                     for field, field_size in enumerate(field_sizes): | ||||
|                         if field in gold_fields: | ||||
|                             target[idx, col_offset + gold_fields[field]] = 1. | ||||
|                         field = self._class_map.feat2field[feature] | ||||
|                         gold_fields[field] = self._class_map.feat2offset[feature] | ||||
|                     for field in self._class_map.fields: | ||||
|                         field_id = self._class_map.field2id[field] | ||||
|                         col_offset = self._class_map.field2col[field] | ||||
|                         if field_id in gold_fields: | ||||
|                             target[idx, col_offset + gold_fields[field_id]] = 1. | ||||
|                         else: | ||||
|                             target[idx, col_offset] = 1. | ||||
|                         col_offset += field_size | ||||
|                     #print(doc[t]) | ||||
|                     #for col, info in enumerate(self._class_map.col2info): | ||||
|                     #    print(col, info, scores[idx, col], target[idx, col]) | ||||
|                 idx += 1 | ||||
|         target = self.model.ops.asarray(target, dtype='f') | ||||
|         scores = self.model.ops.asarray(scores, dtype='f') | ||||
|  |  | |||
|  | @ -418,6 +418,8 @@ class Tagger(Pipe): | |||
|                         vocab.morphology.assign_tag_id(&doc.c[j], tag_id) | ||||
|                         if lemma != 0 and lemma != doc.c[j].lex.orth: | ||||
|                             doc.c[j].lemma = lemma | ||||
|                     else: | ||||
|                         doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] | ||||
|                 idx += 1 | ||||
|             if tensors is not None and len(tensors): | ||||
|                 if isinstance(doc.tensor, numpy.ndarray) \ | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user