diff --git a/spacy/en.pxd b/spacy/en.pxd index 8ce023106..6887dbc08 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -5,6 +5,57 @@ from .tokens cimport Tokens from .tokens cimport TokenC +cpdef enum en_person_t: + NO_PERSON + FIRST + SECOND + THIRD + + +cpdef enum en_number_t: + NO_NUMBER + SINGULAR + PLURAL + MASS + CARDINAL + ORDINAL + + +cpdef enum en_gender_t: + NO_GENDER + MASCULINE + FEMININE + + +cpdef enum en_tenspect_t: + NO_TENSE + BASE_VERB + PRESENT + PAST + PASSIVE + ING + MODAL + + +cpdef enum en_case_t: + NO_CASE + NOMINATIVE + ACCUSATIVE + GENITIVE + DEMONYM + + +cpdef enum misc_t: + NO_MISC + COMPARATIVE + SUPERLATIVE + RELATIVE + NAME + URL + EMAIL + EMOTICON + + # Flags cpdef enum FlagID: IS_ALPHA diff --git a/spacy/en.pyx b/spacy/en.pyx index c0eb0368b..fa59ef933 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -35,6 +35,63 @@ from __future__ import unicode_literals cimport lang from .typedefs cimport flags_t import orth +from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB +from .tagger cimport X, PUNCT, EOL + + +POS_TAGS = { + 'NULL': (NO_TAG, {}), + 'EOL': (EOL, {}), + 'CC': (CONJ, {}), + 'CD': (NUM, {}), + 'DT': (DET, {}), + 'EX': (DET, {}), + 'FW': (X, {}), + 'IN': (ADP, {}), + 'JJ': (ADJ, {}), + 'JJR': (ADJ, {'misc': COMPARATIVE}), + 'JJS': (ADJ, {'misc': SUPERLATIVE}), + 'LS': (X, {}), + 'MD': (VERB, {'tenspect': MODAL}), + 'NN': (NOUN, {}), + 'NNS': (NOUN, {'number': PLURAL}), + 'NNP': (NOUN, {'misc': NAME}), + 'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}), + 'PDT': (DET, {}), + 'POS': (PRT, {'case': GENITIVE}), + 'PRP': (NOUN, {}), + 'PRP$': (NOUN, {'case': GENITIVE}), + 'RB': (ADV, {}), + 'RBR': (ADV, {'misc': COMPARATIVE}), + 'RBS': (ADV, {'misc': SUPERLATIVE}), + 'RP': (PRT, {}), + 'SYM': (X, {}), + 'TO': (PRT, {}), + 'UH': (X, {}), + 'VB': (VERB, {}), + 'VBD': (VERB, {'tenspect': PAST}), + 'VBG': (VERB, {'tenspect': ING}), + 'VBN': (VERB, {'tenspect': PASSIVE}), + 'VBP': (VERB, {'tenspect': PRESENT}), + 'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}), + 'WDT': (DET, {'misc': RELATIVE}), + 'WP': (PRON, {'misc': RELATIVE}), + 'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}), + 'WRB': (ADV, {'misc': RELATIVE}), + '!': (PUNCT, {}), + '#': (PUNCT, {}), + '$': (PUNCT, {}), + "''": (PUNCT, {}), + "(": (PUNCT, {}), + ")": (PUNCT, {}), + "-LRB-": (PUNCT, {}), + "-RRB-": (PUNCT, {}), + ".": (PUNCT, {}), + ",": (PUNCT, {}), + "``": (PUNCT, {}), + ":": (PUNCT, {}), + "?": (PUNCT, {}), +} POS_TEMPLATES = ( @@ -91,19 +148,25 @@ cdef class English(Language): def set_pos(self, Tokens tokens): cdef int i cdef atom_t[N_CONTEXT_FIELDS] context + cdef TokenC* t = tokens.data for i in range(tokens.length): - fill_pos_context(context, i, tokens.data) - tokens.data[i].pos = self.pos_tagger.predict(context) + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context) + #self.morphalyser.set_token(&t[i]) def train_pos(self, Tokens tokens, golds): cdef int i cdef atom_t[N_CONTEXT_FIELDS] context c = 0 + cdef TokenC* t = tokens.data for i in range(tokens.length): - fill_pos_context(context, i, tokens.data) - tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]]) - c += tokens.data[i].pos == golds[i] + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context, [golds[i]]) + t[i].morph = self.pos_tagger.tags[t[i].pos].morph + #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex) + c += t[i].pos == golds[i] return c + EN = English('en') diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 20374f40d..124281a6b 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -2,20 +2,20 @@ from libcpp.vector cimport vector from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER -from preshed.maps cimport PreshMap +from preshed.maps cimport PreshMap, PreshMapArray from cymem.cymem cimport Pool from .typedefs cimport hash_t from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme from .tagger cimport Tagger +from .tagger cimport PosTag from .utf8string cimport StringStore, UniStr cdef class Lexicon: cpdef public get_lex_props cdef Pool mem - cpdef readonly size_t size cpdef readonly StringStore strings cdef vector[Lexeme*] lexemes @@ -29,13 +29,17 @@ cdef class Language: cdef readonly unicode name cdef PreshMap _cache cdef PreshMap _specials + cdef PreshMapArray _lemmas cpdef readonly Lexicon lexicon cpdef readonly Tagger pos_tagger + cpdef readonly object lemmatizer cdef object _prefix_re cdef object _suffix_re cdef object _infix_re + cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1 + cpdef Tokens tokens_from_list(self, list strings) cpdef Tokens tokenize(self, unicode text) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 496c6742c..fdeb7df66 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -14,6 +14,7 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 from preshed.maps cimport PreshMap +from .lemmatizer import Lemmatizer from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME @@ -26,6 +27,8 @@ from . import util from .util import read_lang_data from .tokens import Tokens +from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS + cdef class Language: def __init__(self, name): @@ -39,14 +42,40 @@ cdef class Language: self._infix_re = re.compile(infix) self.lexicon = Lexicon(self.get_props) self._load_special_tokenization(rules) + self._lemmas = PreshMapArray(N_UNIV_TAGS) self.pos_tagger = None + self.lemmatizer = None def load(self): + self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet')) self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) if path.exists(path.join(util.DATA_DIR, self.name, 'pos')): self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) + cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1: + if self.lemmatizer is None: + return lex.sic + if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ: + return lex.sic + cdef int lemma = self._lemmas.get(pos.pos, lex.sic) + if lemma != 0: + return lemma + cdef bytes py_string = self.lexicon.strings[lex.sic] + cdef set lemma_strings + cdef bytes lemma_string + if pos.pos == NOUN: + lemma_strings = self.lemmatizer.noun(py_string) + elif pos.pos == VERB: + lemma_strings = self.lemmatizer.verb(py_string) + else: + assert pos.pos == ADJ + lemma_strings = self.lemmatizer.adj(py_string) + lemma_string = sorted(lemma_strings)[0] + lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i + self._lemmas.set(pos.pos, lex.sic, lemma) + return lemma + cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) cdef Tokens tokens = Tokens(self.lexicon.strings, length) @@ -254,9 +283,11 @@ cdef class Lexicon: self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) - self.size = 2 self.get_lex_props = get_props + def __len__(self): + return self.lexemes.size() + cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. If the pool @@ -269,14 +300,13 @@ cdef class Lexicon: mem = self.mem cdef unicode py_string = string.chars[:string.n] lex = mem.alloc(sizeof(Lexeme), 1) - lex[0] = lexeme_init(self.size, py_string, string.key, self.strings, + lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings, self.get_lex_props(py_string)) if mem is self.mem: self._map.set(string.key, lex) while self.lexemes.size() < (lex.id + 1): self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes[lex.id] = lex - self.size += 1 else: lex[0].id = 1 return lex @@ -302,6 +332,8 @@ cdef class Lexicon: a dict if the operator is called from Python. ''' if type(id_or_string) == int: + if id_or_string >= self.lexemes.size(): + raise IndexError return self.lexemes.at(id_or_string)[0] cdef UniStr string slice_unicode(&string, id_or_string, 0, len(id_or_string)) @@ -359,5 +391,4 @@ cdef class Lexicon: self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes[lexeme.id] = lexeme i += 1 - self.size += 1 fclose(fp) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index a42a5daee..ce9bbefdc 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -53,6 +53,7 @@ class Lemmatizer(object): def lemmatize(string, index, exceptions, rules): + string = string.lower() forms = [] if string in index: forms.append(string) @@ -62,6 +63,8 @@ def lemmatize(string, index, exceptions, rules): form = string[:len(string) - len(old)] + new if form in index: forms.append(form) + if not forms: + forms.append(string) return set(forms) diff --git a/spacy/pos_util.py b/spacy/pos_util.py index e5716665e..489f03dde 100644 --- a/spacy/pos_util.py +++ b/spacy/pos_util.py @@ -147,6 +147,7 @@ Y PRT Z NOUN ^ NOUN ~ X -`` .""".strip().split('\n')) +`` . +EOL EOL""".strip().split('\n')) return mapping[tag] diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index f91bbeb0a..11880bf13 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -1,11 +1,40 @@ +from libc.stdint cimport uint8_t + from cymem.cymem cimport Pool from thinc.learner cimport LinearModel from thinc.features cimport Extractor from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t +from preshed.maps cimport PreshMapArray + from .typedefs cimport hash_t -from .tokens cimport Tokens +from .tokens cimport Tokens, Morphology + + +# Google universal tag set +cdef enum univ_tag_t: + NO_TAG + ADJ + ADV + ADP + CONJ + DET + NOUN + NUM + PRON + PRT + VERB + X + PUNCT + EOL + N_UNIV_TAGS + + +cdef struct PosTag: + Morphology morph + int id + univ_tag_t pos cdef class Tagger: @@ -16,4 +45,5 @@ cdef class Tagger: cpdef readonly LinearModel model cpdef readonly list tag_names + cdef PosTag* tags cdef dict tagdict diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 22ec3896a..db7974d91 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -12,13 +12,14 @@ import cython from thinc.features cimport Feature, count_feats -def setup_model_dir(tag_names, tag_counts, templates, model_dir): +def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) os.mkdir(model_dir) config = { 'templates': templates, 'tag_names': tag_names, + 'tag_map': tag_map, 'tag_counts': tag_counts, } with open(path.join(model_dir, 'config.json'), 'w') as file_: @@ -33,16 +34,31 @@ cdef class Tagger: self.mem = Pool() cfg = json.load(open(path.join(model_dir, 'config.json'))) templates = cfg['templates'] + tag_map = cfg['tag_map'] + univ_counts = {} + cdef unicode tag + cdef unicode univ_tag self.tag_names = cfg['tag_names'] + self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) + for i, tag in enumerate(self.tag_names): + pos, props = tag_map[tag] + self.tags[i].id = i + self.tags[i].pos = pos + self.tags[i].morph.number = props.get('number', 0) + self.tags[i].morph.tenspect = props.get('tenspect', 0) + self.tags[i].morph.mood = props.get('mood', 0) + self.tags[i].morph.gender = props.get('gender', 0) + self.tags[i].morph.person = props.get('person', 0) + self.tags[i].morph.case = props.get('case', 0) + self.tags[i].morph.misc = props.get('misc', 0) self.tagdict = _make_tag_dict(cfg['tag_counts']) self.extractor = Extractor(templates) self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) - cdef class_t predict(self, const atom_t* context, object golds=None) except *: - """Predict the tag of tokens[i]. The tagger remembers the features and - prediction, in case you later call tell_answer. + cdef class_t predict(self, atom_t* context, object golds=None) except *: + """Predict the tag of tokens[i]. >>> tokens = EN.tokenize(u'An example sentence.') >>> tag = EN.pos_tagger.predict(0, tokens) @@ -69,6 +85,24 @@ cdef class Tagger: return tag_id +UNIV_TAGS = { + 'NULL': NO_TAG, + 'ADJ': ADJ, + 'ADV': ADV, + 'ADP': ADP, + 'CONJ': CONJ, + 'DET': DET, + 'NOUN': NOUN, + 'NUM': NUM, + 'PRON': PRON, + 'PRT': PRT, + 'VERB': VERB, + 'X': X, + '.': PUNCT, + 'EOL': EOL +} + + def _make_tag_dict(counts): freq_thresh = 50 ambiguity_thresh = 0.98 diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index e6bc0a46a..6f4691716 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -5,14 +5,29 @@ from cymem.cymem cimport Pool from thinc.typedefs cimport atom_t from .lexeme cimport Lexeme + from .typedefs cimport flags_t from .utf8string cimport StringStore +from libc.stdint cimport uint8_t, uint16_t + + +cdef struct Morphology: + uint8_t number + uint8_t tenspect # Tense/aspect/voice + uint8_t mood + uint8_t gender + uint8_t person + uint8_t case + uint8_t misc + cdef struct TokenC: const Lexeme* lex + Morphology morph int idx int pos + int lemma int sense @@ -37,7 +52,7 @@ cdef class Token: cdef public int i cdef public int idx cdef public int pos - cdef public int ner + cdef int lemma cdef public atom_t id cdef public atom_t cluster diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 33f265eef..004d0578c 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -51,7 +51,7 @@ cdef class Tokens: def __getitem__(self, i): bounds_check(i, self.length, PADDING) return Token(self._string_store, i, self.data[i].idx, self.data[i].pos, - self.data[i].sense, self.data[i].lex[0]) + self.data[i].lemma, self.data[i].lex[0]) def __iter__(self): for i in range(self.length): @@ -128,14 +128,15 @@ cdef class Tokens: @cython.freelist(64) cdef class Token: - def __init__(self, StringStore string_store, int i, int idx, int pos, int ner, + def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma, dict lex): self._string_store = string_store self.idx = idx self.pos = pos - self.ner = ner self.i = i self.id = lex['id'] + + self.lemma = lemma self.cluster = lex['cluster'] self.length = lex['length'] @@ -156,3 +157,10 @@ cdef class Token: return '' cdef bytes utf8string = self._string_store[self.sic] return utf8string.decode('utf8') + + property lemma: + def __get__(self): + if self.lemma == 0: + return self.string + cdef bytes utf8string = self._string_store[self.lemma] + return utf8string.decode('utf8')