Generalize tagger code, in preparation for NER and supersense tagging.

commit 3733444101 (parent 81da61f3cf)
@@ -6,7 +6,7 @@ from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
 from .tokens cimport Tokens
 from .lexeme cimport Lexeme
-from .pos cimport Tagger as PosTagger
+from .tagger cimport Tagger
 from .utf8string cimport StringStore
 
 
@@ -41,14 +41,13 @@ cdef class Language:
     cdef PreshMap _specials
     cpdef readonly Lexicon lexicon
 
-    cpdef readonly PosTagger pos_tagger
+    cpdef readonly Tagger pos_tagger
 
     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re
 
     cpdef Tokens tokenize(self, unicode text)
-    cpdef Tokens pos_tag(self, Tokens t)
 
     cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
     cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
---
@@ -23,7 +23,7 @@ from . import util
 from .util import read_lang_data
 from .tokens import Tokens
 
-from .pos cimport Tagger as PosTagger
+from .tagger cimport Tagger
 
 
 cdef class Language:
@@ -42,7 +42,7 @@ cdef class Language:
             self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
         self._load_special_tokenization(rules)
         if path.exists(path.join(util.DATA_DIR, name, 'pos')):
-            self.pos_tagger = PosTagger(path.join(util.DATA_DIR, name, 'pos'))
+            self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
         else:
             self.pos_tagger = None
 
@@ -93,16 +93,6 @@ cdef class Language:
                 self._tokenize(tokens, &span, start, i)
         return tokens
 
-    cpdef Tokens pos_tag(self, Tokens t):
-        if self.pos_tagger is None:
-            return t
-        cdef int i
-        t.pos[-1] = self.pos_tagger.encode_pos('EOL')
-        t.pos[-2] = self.pos_tagger.encode_pos('EOL')
-        for i in range(t.length):
-            t.pos[i] = self.pos_tagger.predict(i, t, t.pos[i-1], t.pos[i-2])
-        return t
-
     cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes
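With Language.pos_tag removed, callers now drive the tagger directly, one greedy step at a time. A minimal sketch of the replacement loop, assuming the bundled English data is installed:

    from spacy.en import EN

    tokens = EN.tokenize(u'The dog sat .')
    if EN.pos_tagger is not None:
        for i in range(tokens.length):
            guess = EN.pos_tagger.predict(i, tokens)
            tokens.set_tag(i, EN.pos_tagger.tag_type, guess)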
---
@@ -1,22 +0,0 @@
-from cymem.cymem cimport Pool
-
-from thinc.learner cimport LinearModel
-from thinc.features cimport Extractor
-from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
-
-from .tokens cimport Tokens
-
-
-cdef class Tagger:
-    cpdef readonly Extractor extractor
-    cpdef readonly LinearModel model
-
-    cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0
-    cpdef bint tell_answer(self, class_t gold_tag) except *
-
-    cdef Pool mem
-    cdef class_t _guess
-    cdef atom_t* _atoms
-    cdef feat_t* _feats
-    cdef weight_t* _values
-    cdef weight_t* _scores
---
@@ -30,7 +30,7 @@ cdef class Tagger:
         if path.exists(tags_loc):
             with open(tags_loc) as file_:
                 Tagger.tags.update(ujson.load(file_))
-        self.model = LinearModel(len(self.tags), self.extractor.n)
+        self.model = LinearModel(len(self.tags))
         if path.exists(path.join(model_dir, 'model')):
             self.model.load(path.join(model_dir, 'model'))
         self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])
---
spacy/pos_feats.pxd (new file, 83 lines)
@@ -0,0 +1,83 @@
+from .tokens cimport Tokens
+from thinc.typedefs cimport atom_t
+
+
+cpdef enum:
+    P2i
+    P2c
+    P2w
+    P2shape
+    P2pref
+    P2suff
+    P2title
+    P2upper
+    P2oft_title
+    P2oft_upper
+    P2pos
+    P2url
+    P2num
+
+    P1i
+    P1c
+    P1w
+    P1shape
+    P1pre
+    P1suff
+    P1title
+    P1upper
+    P1oft_title
+    P1oft_upper
+    P1pos
+    P1url
+    P1num
+
+    N0i
+    N0c
+    N0w
+    N0shape
+    N0pref
+    N0suff
+    N0title
+    N0upper
+    N0oft_title
+    N0oft_upper
+    N0pos
+    N0url
+    N0num
+
+    N1i
+    N1c
+    N1w
+    N1shape
+    N1pref
+    N1suff
+    N1title
+    N1upper
+    N1oft_title
+    N1oft_upper
+    N1pos
+    N1url
+    N1num
+
+    N2i
+    N2c
+    N2w
+    N2shape
+    N2pref
+    N2suff
+    N2title
+    N2upper
+    N2oft_title
+    N2oft_upper
+    N2pos
+    N2url
+    N2num
+
+    P2t
+    P1t
+
+    CONTEXT_SIZE
+
+
+
+cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1
---
spacy/pos_feats.pyx (new file, 77 lines)
@@ -0,0 +1,77 @@
+from .lexeme cimport *
+
+from thinc.typedefs cimport atom_t
+
+
+TEMPLATES = (
+    (N0i,),
+    (N0w,),
+    (N0suff,),
+    (N0pref,),
+    (P1t,),
+    (P2t,),
+    (P1t, P2t),
+    (P1t, N0w),
+    (P1w,),
+    (P1suff,),
+    (P2w,),
+    (N1w,),
+    (N1suff,),
+    (N2w,),
+
+    (N0shape,),
+    (N0c,),
+    (N1c,),
+    (N2c,),
+    (P1c,),
+    (P2c,),
+    (P1c, N0c),
+    (N0c, N1c),
+    (P1c, P1t),
+    (P1c, P1t, N0c),
+    (P1t, N0c),
+    (N0oft_upper,),
+    (N0oft_title,),
+
+    (P1w, N0w),
+    (N0w, N1w),
+
+    (N0pos,),
+    (P1t, N0pos, N1pos),
+    (P1t, N1pos),
+
+    (N0url,),
+    (N0num,),
+    (P1url,),
+    (P1url,),
+    (N1num,),
+    (N1url,),
+)
+
+
+cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
+    _fill_token(&context[P2i], tokens.lex[i-2])
+    _fill_token(&context[P1i], tokens.lex[i-1])
+    _fill_token(&context[N0i], tokens.lex[i])
+    _fill_token(&context[N1i], tokens.lex[i+1])
+    _fill_token(&context[N2i], tokens.lex[i+2])
+    context[P1t] = tokens.pos[i-1]
+    context[P2t] = tokens.pos[i-2]
+
+
+cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
+    atoms[0] = lex.sic
+    atoms[1] = lex.cluster
+    atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
+    atoms[3] = lex.shape
+    atoms[4] = lex.prefix
+    atoms[5] = lex.suffix
+
+    atoms[6] = lex.flags & (1 << IS_TITLE)
+    atoms[7] = lex.flags & (1 << IS_UPPER)
+    atoms[8] = lex.flags & (1 << OFT_TITLE)
+    atoms[9] = lex.flags & (1 << OFT_UPPER)
+    atoms[10] = lex.postype
+    atoms[11] = lex.flags & (1 << LIKE_URL)
+    atoms[12] = lex.flags & (1 << LIKE_NUMBER)
+
 | ||||
|  | @ -6,9 +6,10 @@ from .en import EN | |||
| from .pos import Tagger | ||||
| 
 | ||||
| 
 | ||||
| def read_gold(file_): | ||||
| def read_gold(file_, tag_list): | ||||
|     paras = file_.read().strip().split('\n\n') | ||||
|     golds = [] | ||||
|     tag_ids = dict((tag, i) for i, tag in enumerate(tag_list)) | ||||
|     for para in paras: | ||||
|         if not para.strip(): | ||||
|             continue | ||||
|  | @ -32,10 +33,16 @@ def read_gold(file_): | |||
|             else: | ||||
|                 conll_toks.pop(0) | ||||
|         assert len(tags) == len(tokens) | ||||
|         tags = [Tagger.encode_pos(t) for t in tags] | ||||
|         tags = [_encode_pos(t, tag_ids, tag_list) for t in tags] | ||||
|         golds.append((tokens, tags)) | ||||
|     return golds | ||||
| 
 | ||||
| def _encode_pos(tag, tag_ids, tag_list): | ||||
|     if tag not in tag_ids: | ||||
|         tag_ids[tag] = len(tag_list) | ||||
|         tag_list.append(tag) | ||||
|     return tag_ids[tag] | ||||
| 
 | ||||
| 
 | ||||
| def ptb_to_univ(tag): | ||||
|     mapping = dict(tuple(line.split()) for line in """ | ||||
|  |  | |||
---
@@ -7,7 +7,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 from .tokens cimport Tokens
 
 
-cdef enum TagType:
+cpdef enum TagType:
    POS
    ENTITY
    SENSE
---
@@ -1,37 +1,93 @@
 # cython: profile=True
+from __future__ import print_function
 from __future__ import unicode_literals
+from __future__ import division
 
 from os import path
+import os
+import shutil
+import random
-import codecs
-import gzip
 import json
+import cython
 
+from .pos_feats cimport fill_context as pos_fill_context
+from .pos_feats cimport CONTEXT_SIZE as POS_CONTEXT_SIZE
+
 from thinc.features cimport ConjFeat
 
 
+NULL_TAG = 0
+
+
+def setup_model_dir(tag_type, tag_names, templates, model_dir):
+    if path.exists(model_dir):
+        shutil.rmtree(model_dir)
+    os.mkdir(model_dir)
+    config = {
+        'tag_type': tag_type,
+        'templates': templates,
+        'tag_names': tag_names,
+    }
+    with open(path.join(model_dir, 'config.json'), 'w') as file_:
+        json.dump(config, file_)
+
+
+def train(train_sents, model_dir, nr_iter=5):
+    tagger = Tagger(model_dir)
+    for _ in range(nr_iter):
+        n_corr = 0
+        total = 0
+        for tokens, golds in train_sents:
+            assert len(tokens) == len(golds), [t.string for t in tokens]
+            for i, gold in enumerate(golds):
+                guess = tagger.predict(i, tokens)
+                tokens.set_tag(i, tagger.tag_type, guess)
+                tagger.tell_answer(gold)
+                if gold != NULL_TAG:
+                    total += 1
+                    n_corr += guess == gold
+        print('%.4f' % ((n_corr / total) * 100))
+        random.shuffle(train_sents)
+    tagger.model.end_training()
+    tagger.model.dump(path.join(model_dir, 'model'), freq_thresh=10)
+
+
+def evaluate(tagger, sents):
+    n_corr = 0
+    total = 0
+    for tokens, golds in sents:
+        for i, gold in enumerate(golds):
+            guess = tagger.predict(i, tokens)
+            tokens.set_tag(i, tagger.tag_type, guess)
+            if gold != NULL_TAG:
+                total += 1
+                n_corr += guess == gold
+    return n_corr / total
+
+
 cdef class Tagger:
     """Assign part-of-speech, named entity or supersense tags, using greedy
     decoding.  The tagger reads its model and configuration from disk.
     """
     def __init__(self, model_dir):
         self.mem = Pool()
-        cfg = json.load(path.join(model_dir, 'config.json'))
+        cfg = json.load(open(path.join(model_dir, 'config.json')))
         templates = cfg['templates']
         self.tag_names = cfg['tag_names']
         self.tag_type = cfg['tag_type']
-        self.model = LinearModel(len(self.tag_names))
+        self.extractor = Extractor(templates, [ConjFeat] * len(templates))
+        self.model = LinearModel(len(self.tag_names), self.extractor.n)
         print("Load tagger model")
         if path.exists(path.join(model_dir, 'model')):
             self.model.load(path.join(model_dir, 'model'))
-        self.extractor = Extractor(templates, [ConjFeat] * len(templates))
         print("Done")
 
         if self.tag_type == POS:
             n_context = POS_CONTEXT_SIZE
         self._context = <atom_t*>self.mem.alloc(n_context, sizeof(atom_t))
         self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
         self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
-        self._scores = <weight_t*>self.mem.alloc(len(self.cfg.tags), sizeof(weight_t))
+        self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
         self._guess = NULL_TAG
 
     cpdef int set_tags(self, Tokens tokens) except -1:
@@ -54,8 +110,10 @@ cdef class Tagger:
         >>> tag = EN.pos_tagger.predict(0, tokens)
         >>> assert tag == EN.pos_tagger.tag_id('DT') == 5
         """
-        #if self.tag_type == POS:
-        #    _pos_feats.fill_context(self._context, i, tokens)
+        if self.tag_type == POS:
+            pos_fill_context(self._context, i, tokens)
+        else:
+            raise StandardError
         self.extractor.extract(self._feats, self._values, self._context, NULL)
         self._guess = self.model.score(self._scores, self._feats, self._values)
         return self._guess
---
@@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
 from .lexeme cimport Lexeme
 from .typedefs cimport flag_t
 from .utf8string cimport StringStore
+from .tagger cimport TagType
 
 from thinc.typedefs cimport atom_t
 
@@ -23,6 +24,7 @@ cdef class Tokens:
 
     cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
     cdef int push_back(self, int i, Lexeme* lexeme) except -1
+    cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1
 
 
 cdef class Token:
---
@@ -4,6 +4,7 @@ cimport cython
 
 DEF PADDING = 5
 
+
 cdef int bounds_check(int i, int length, int padding) except -1:
     if (i + padding) < 0:
         raise IndexError
@@ -89,6 +90,9 @@ cdef class Tokens:
                 idx = self.push_back(idx, lexemes[i])
         return idx
 
+    cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
+        self.pos[i] = tag
+
     def _realloc(self, new_size):
         self.max_length = new_size
         n = new_size + (PADDING * 2)
@@ -130,4 +134,3 @@ cdef class Token:
                 return ''
             cdef bytes utf8string = self._string_store[self.sic]
             return utf8string.decode('utf8')
-
---
@@ -6,5 +6,3 @@ ctypedef uint64_t flag_t
 ctypedef uint32_t id_t
 ctypedef uint16_t len_t
 ctypedef uint16_t tag_t
-
-