diff --git a/spacy/en.pxd b/spacy/en.pxd deleted file mode 100644 index 66ccd64ce..000000000 --- a/spacy/en.pxd +++ /dev/null @@ -1,135 +0,0 @@ -from thinc.typedefs cimport atom_t - -from .lang cimport Language -from .tokens cimport Tokens -from .tokens cimport TokenC - - -cpdef enum en_person_t: - NO_PERSON - FIRST - SECOND - THIRD - NON_THIRD - - -cpdef enum en_number_t: - NO_NUMBER - SINGULAR - PLURAL - MASS - - -cpdef enum en_gender_t: - NO_GENDER - MASCULINE - FEMININE - NEUTER - - -cpdef enum en_case_t: - NO_CASE - NOMINATIVE - GENITIVE - ACCUSATIVE - REFLEXIVE - DEMONYM - - -cpdef enum en_tenspect_t: - NO_TENSE - BASE_VERB - PRESENT - PAST - PASSIVE - ING - MODAL - - -cpdef enum misc_t: - NO_MISC - COMPARATIVE - SUPERLATIVE - RELATIVE - NAME - - -# Flags -cpdef enum FlagID: - IS_ALPHA - IS_ASCII - IS_DIGIT - IS_LOWER - IS_PUNCT - IS_SPACE - IS_TITLE - IS_UPPER - - LIKE_URL - LIKE_NUMBER - - OFT_LOWER - OFT_TITLE - OFT_UPPER - - IN_MALES - IN_FEMALES - IN_SURNAMES - IN_PLACES - IN_GAMES - IN_CELEBS - IN_NAMES - - -cpdef enum: - P2_sic - P2_cluster - P2_shape - P2_prefix - P2_suffix - P2_pos - P2_lemma - P2_pos_type - - P1_sic - P1_cluster - P1_shape - P1_prefix - P1_suffix - P1_pos - P1_lemma - P1_pos_type - - W_sic - W_cluster - W_shape - W_prefix - W_suffix - W_pos - W_lemma - W_pos_type - - N1_sic - N1_cluster - N1_shape - N1_prefix - N1_suffix - N1_pos - N1_lemma - N1_pos_type - - N2_sic - N2_cluster - N2_shape - N2_prefix - N2_suffix - N2_pos - N2_lemma - N2_pos_type - - N_CONTEXT_FIELDS - - -cdef class English(Language): - cdef int is_base_np_end(self, const TokenC* token) except -1 - cdef int is_outside_base_np(self, const TokenC* token) except -1 diff --git a/spacy/en.pyx b/spacy/en.pyx deleted file mode 100644 index 614c20bd7..000000000 --- a/spacy/en.pyx +++ /dev/null @@ -1,213 +0,0 @@ -# cython: profile=True -# cython: embedsignature=True -'''Tokenize English text, using a scheme that differs from the Penn Treebank 3 -scheme in several important respects: - -* Whitespace is added as tokens, except for single spaces. e.g., - - >>> [w.string for w in EN.tokenize(u'\\nHello \\tThere')] - [u'\\n', u'Hello', u' ', u'\\t', u'There'] - -* Contractions are normalized, e.g. - - >>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")] - [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"] - -* Hyphenated words are split, with the hyphen preserved, e.g.: - - >>> [w.string for w in EN.tokenize(u'New York-based')] - [u'New', u'York', u'-', u'based'] - -Other improvements: - -* Email addresses, URLs, European-formatted dates and other numeric entities not - found in the PTB are tokenized correctly -* Heuristic handling of word-final periods (PTB expects sentence boundary detection - as a pre-process before tokenization.) - -Take care to ensure your training and run-time data is tokenized according to the -same scheme. Tokenization problems are a major cause of poor performance for -NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module -provides a fully Penn Treebank 3-compliant tokenizer. 
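A minimal, self-contained sketch of one plausible reading of the whitespace rule above (whitespace runs emit one token per character, but a lone single space produces no token); this is a toy illustration of the docstring example, not the actual Cython tokenizer:

    import re

    def toy_ws_tokenize(text):
        tokens = []
        # Split into alternating whitespace / non-whitespace runs.
        for run in re.findall(r'\s+|\S+', text):
            if run.isspace():
                if run == ' ':
                    continue        # a lone single space is skipped
                tokens.extend(run)  # longer runs: one token per character
            else:
                tokens.append(run)
        return tokens

    print(toy_ws_tokenize(u'\nHello \tThere'))
    # -> ['\n', 'Hello', ' ', '\t', 'There'], matching the docstring example above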
-''' -from __future__ import unicode_literals - -from murmurhash.mrmr cimport hash64 - -cimport lang -from .typedefs cimport hash_t, id_t, flags_t -import orth -from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB -from .morphology cimport X, PUNCT, EOL - -from .tokens cimport Morphology - - -DEF USE_POS_CACHE = True - - -POS_TAGS = { - 'NULL': (NO_TAG, {}), - 'EOL': (EOL, {}), - 'CC': (CONJ, {}), - 'CD': (NUM, {}), - 'DT': (DET, {}), - 'EX': (DET, {}), - 'FW': (X, {}), - 'IN': (ADP, {}), - 'JJ': (ADJ, {}), - 'JJR': (ADJ, {'misc': COMPARATIVE}), - 'JJS': (ADJ, {'misc': SUPERLATIVE}), - 'LS': (X, {}), - 'MD': (VERB, {'tenspect': MODAL}), - 'NN': (NOUN, {}), - 'NNS': (NOUN, {'number': PLURAL}), - 'NNP': (NOUN, {'misc': NAME}), - 'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}), - 'PDT': (DET, {}), - 'POS': (PRT, {'case': GENITIVE}), - 'PRP': (NOUN, {}), - 'PRP$': (NOUN, {'case': GENITIVE}), - 'RB': (ADV, {}), - 'RBR': (ADV, {'misc': COMPARATIVE}), - 'RBS': (ADV, {'misc': SUPERLATIVE}), - 'RP': (PRT, {}), - 'SYM': (X, {}), - 'TO': (PRT, {}), - 'UH': (X, {}), - 'VB': (VERB, {}), - 'VBD': (VERB, {'tenspect': PAST}), - 'VBG': (VERB, {'tenspect': ING}), - 'VBN': (VERB, {'tenspect': PASSIVE}), - 'VBP': (VERB, {'tenspect': PRESENT}), - 'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}), - 'WDT': (DET, {'misc': RELATIVE}), - 'WP': (PRON, {'misc': RELATIVE}), - 'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}), - 'WRB': (ADV, {'misc': RELATIVE}), - '!': (PUNCT, {}), - '#': (PUNCT, {}), - '$': (PUNCT, {}), - "''": (PUNCT, {}), - "(": (PUNCT, {}), - ")": (PUNCT, {}), - "-LRB-": (PUNCT, {}), - "-RRB-": (PUNCT, {}), - ".": (PUNCT, {}), - ",": (PUNCT, {}), - "``": (PUNCT, {}), - ":": (PUNCT, {}), - "?": (PUNCT, {}), -} - - -POS_TEMPLATES = ( - (W_sic,), - (P1_lemma, P1_pos), - (P2_lemma, P2_pos), - (N1_sic,), - (N2_sic,), - - (W_suffix,), - (W_prefix,), - - (P1_pos,), - (P2_pos,), - (P1_pos, P2_pos), - (P1_pos, W_sic), - (P1_suffix,), - (N1_suffix,), - - (W_shape,), - (W_cluster,), - (N1_cluster,), - (N2_cluster,), - (P1_cluster,), - (P2_cluster,), - - (W_pos_type,), - (N1_pos_type,), - (N1_pos_type,), - (P1_pos, W_pos_type, N1_pos_type), -) - - -cdef class English(Language): - """English tokenizer, tightly coupled to lexicon. - - Attributes: - name (unicode): The two letter code used by Wikipedia for the language. - lexicon (Lexicon): The lexicon. Exposes the lookup method. 
- """ - def get_props(self, unicode string): - return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)} - - def set_flags(self, unicode string): - cdef flags_t flags = 0 - flags |= orth.is_alpha(string) << IS_ALPHA - flags |= orth.is_ascii(string) << IS_ASCII - flags |= orth.is_digit(string) << IS_DIGIT - flags |= orth.is_lower(string) << IS_LOWER - flags |= orth.is_punct(string) << IS_PUNCT - flags |= orth.is_space(string) << IS_SPACE - flags |= orth.is_title(string) << IS_TITLE - flags |= orth.is_upper(string) << IS_UPPER - - flags |= orth.like_url(string) << LIKE_URL - flags |= orth.like_number(string) << LIKE_NUMBER - return flags - - def set_pos(self, Tokens tokens): - cdef int i - cdef atom_t[N_CONTEXT_FIELDS] context - cdef TokenC* t = tokens.data - cdef id_t[2] bigram - cdef hash_t cache_key - cdef void* cached = NULL - assert self.morphologizer is not None - cdef dict tagdict = self.pos_tagger.tagdict - for i in range(tokens.length): - fill_pos_context(context, i, t) - t[i].pos = self.pos_tagger.predict(context) - self.morphologizer.set_morph(i, t) - - def train_pos(self, Tokens tokens, golds): - cdef int i - cdef atom_t[N_CONTEXT_FIELDS] context - c = 0 - cdef TokenC* t = tokens.data - for i in range(tokens.length): - fill_pos_context(context, i, t) - t[i].pos = self.pos_tagger.predict(context, [golds[i]]) - self.morphologizer.set_morph(i, t) - c += t[i].pos == golds[i] - return c - - cdef int is_base_np_end(self, const TokenC* token) except -1: - pass - - cdef int is_outside_base_np(self, const TokenC* token) except -1: - pass - - - -cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1: - _fill_from_token(&context[P2_sic], &tokens[i-2]) - _fill_from_token(&context[P1_sic], &tokens[i-1]) - _fill_from_token(&context[W_sic], &tokens[i]) - _fill_from_token(&context[N1_sic], &tokens[i+1]) - _fill_from_token(&context[N2_sic], &tokens[i+2]) - - -cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: - context[0] = t.lex.sic - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.pos - context[6] = t.lemma - context[7] = t.lex.pos_type - - -EN = English('en') diff --git a/spacy/index.pxd b/spacy/index.pxd deleted file mode 100644 index 2976150de..000000000 --- a/spacy/index.pxd +++ /dev/null @@ -1,44 +0,0 @@ -from libcpp.vector cimport vector -from libcpp.pair cimport pair - -from preshed.counter cimport count_t -from preshed.maps cimport PreshMap -from preshed.counter cimport PreshCounter -from cymem.cymem cimport Pool - -from .lang cimport Lexicon -from .tokens cimport Tokens, TokenC -from .typedefs cimport id_t -from .lexeme cimport attr_id_t -from .typedefs cimport attr_t -from .typedefs cimport hash_t - -from murmurhash.mrmr cimport hash64 - - -ctypedef vector[pair[id_t, count_t]] count_vector_t - - -cdef class Index: - cdef attr_id_t attr_id - cdef readonly attr_t max_value - cdef vector[count_vector_t] counts - - cpdef int count(self, Tokens tokens) except -1 - - -cdef class DecisionMemory: - cdef int n_classes - cdef Pool mem - cdef PreshCounter _counts - cdef PreshCounter _class_counts - cdef PreshMap memos - cdef list class_names - - cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1 - cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1 - - cdef inline int get(self, hash_t context_key) nogil: - return self.memos.get(context_key) - 1 - - diff --git a/spacy/index.pyx b/spacy/index.pyx 
deleted file mode 100644 index e621f584f..000000000 --- a/spacy/index.pyx +++ /dev/null @@ -1,120 +0,0 @@ -"""Create a term-document matrix""" -cimport cython -from libc.stdint cimport int64_t -from libc.string cimport memmove - -from cymem.cymem cimport Address - -from .lexeme cimport Lexeme, get_attr -from .tokens cimport TokenC -from .typedefs cimport hash_t - -from preshed.maps cimport MapStruct, Cell, map_get, map_set, map_init -from murmurhash.mrmr cimport hash64 - - -cdef class Index: - def __init__(self, attr_id_t attr_id): - self.attr_id = attr_id - self.max_value = 0 - - cpdef int count(self, Tokens tokens) except -1: - cdef PreshCounter counts = PreshCounter(2 ** 8) - cdef attr_id_t attr_id = self.attr_id - cdef attr_t term - cdef int i - for i in range(tokens.length): - term = get_attr(tokens.data[i].lex, attr_id) - counts.inc(term, 1) - if term > self.max_value: - self.max_value = term - cdef count_t count - cdef count_vector_t doc_counts - for term, count in counts: - doc_counts.push_back(pair[id_t, count_t](term, count)) - self.counts.push_back(doc_counts) - - -cdef class DecisionMemory: - def __init__(self, class_names): - self.class_names = class_names - self.n_classes = len(class_names) - self.mem = Pool() - self._counts = PreshCounter() - self._class_counts = PreshCounter() - self.memos = PreshMap() - - def load(self, loc, thresh=50): - cdef: - count_t freq - hash_t key - int clas - for line in open(loc): - freq, key, clas = [int(p) for p in line.split()] - if thresh == 0 or freq >= thresh: - self.memos.set(key, (clas+1)) - - def __getitem__(self, ids): - cdef id_t[2] context - context[0] = context[0] - context[1] = context[1] - cdef hash_t context_key = hash64(context, 2 * sizeof(id_t), 0) - cdef hash_t[2] class_context - class_context[0] = context_key - counts = {} - cdef id_t i - for i, clas in enumerate(self.clas_names): - class_context[1] = i - key = hash64(class_context, sizeof(hash_t) * 2, 0) - count = self._class_counts[key] - counts[clas] = count - return counts - - @cython.cdivision(True) - def iter_contexts(self, float min_acc=0.99, count_t min_freq=10): - cdef Address counts_addr = Address(self.n_classes, sizeof(count_t)) - cdef count_t* counts = counts_addr.ptr - cdef MapStruct* context_counts = self._counts.c_map - cdef hash_t context_key - cdef count_t context_freq - cdef int best_class - cdef float acc - - cdef int i - for i in range(context_counts.length): - context_key = context_counts.cells[i].key - context_freq = context_counts.cells[i].value - if context_key != 0 and context_freq >= min_freq: - best_class = self.find_best_class(counts, context_key) - acc = counts[best_class] / context_freq - if acc >= min_acc: - yield counts[best_class], context_key, best_class - - cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1: - cdef hash_t context_and_class_key - cdef hash_t[2] context_and_class - context_and_class[0] = context_key - context_and_class[1] = clas - context_and_class_key = hash64(context_and_class, 2 * sizeof(hash_t), 0) - self._counts.inc(context_key, inc) - self._class_counts.inc(context_and_class_key, inc) - - cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1: - cdef hash_t[2] unhashed_key - unhashed_key[0] = context_key - - cdef count_t total = 0 - cdef hash_t key - cdef int clas - cdef int best - cdef int mode = 0 - for clas in range(self.n_classes): - unhashed_key[1] = clas - key = hash64(unhashed_key, sizeof(hash_t) * 2, 0) - count = self._class_counts[key] - counts[clas] = count - if 
count >= mode: - mode = count - best = clas - total += count - return best diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py deleted file mode 100644 index ce9bbefdc..000000000 --- a/spacy/lemmatizer.py +++ /dev/null @@ -1,90 +0,0 @@ -from os import path - - -NOUN_RULES = ( - ('s', ''), - ('ses', 's'), - ('ves', 'f'), - ('xes', 'x'), - ('zes', 'z'), - ('ches', 'ch'), - ('shes', 'sh'), - ('men', 'man'), - ('ies', 'y') -) - - -VERB_RULES = ( - ("s", ""), - ("ies", "y"), - ("es", "e"), - ("es", ""), - ("ed", "e"), - ("ed", ""), - ("ing", "e"), - ("ing", "") -) - - -ADJ_RULES = ( - ("er", ""), - ("est", ""), - ("er", "e"), - ("est", "e") -) - - -class Lemmatizer(object): - def __init__(self, wn_dict_dir): - self.index = {} - self.exc = {} - for pos in ['adj', 'adv', 'noun', 'verb']: - self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos)) - self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) - - def noun(self, string): - return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES) - - def verb(self, string): - return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES) - - def adj(self, string): - return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES) - - -def lemmatize(string, index, exceptions, rules): - string = string.lower() - forms = [] - if string in index: - forms.append(string) - forms.extend(exceptions.get(string, [])) - for old, new in rules: - if string.endswith(old): - form = string[:len(string) - len(old)] + new - if form in index: - forms.append(form) - if not forms: - forms.append(string) - return set(forms) - - -def read_index(loc): - index = set() - for line in open(loc): - if line.startswith(' '): - continue - pieces = line.split() - word = pieces[0] - if word.count('_') == 0: - index.add(word) - return index - - -def read_exc(loc): - exceptions = {} - for line in open(loc): - if line.startswith(' '): - continue - pieces = line.split() - exceptions[pieces[0]] = tuple(pieces[1:]) - return exceptions diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 30e4aef4c..5401de3ad 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -36,11 +36,11 @@ cdef struct _Cached: cdef class Morphologizer: """Given a POS tag and a Lexeme, find its lemma and morphological analysis. 
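A quick usage sketch for the rule-based lemmatize() defined in lemmatizer.py above, with a toy index and exception table standing in for the WordNet dict files (in real use these come from read_index() and read_exc()):

    toy_index = {'pony', 'wolf', 'child'}
    toy_exc = {'children': ('child',)}

    print(lemmatize('ponies', toy_index, toy_exc, NOUN_RULES))    # {'pony'}  via the 'ies' -> 'y' rule
    print(lemmatize('wolves', toy_index, toy_exc, NOUN_RULES))    # {'wolf'}  via the 'ves' -> 'f' rule
    print(lemmatize('children', toy_index, toy_exc, NOUN_RULES))  # {'child'} via the exception table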
""" - def __init__(self, StringStore strings, object lemmatizer, **kwargs): + def __init__(self, StringStore strings, object lemmatizer, + irregulars=None, tag_map=None, tag_names=None): self.mem = Pool() self.strings = strings - tag_map = kwargs['tag_map'] - self.tag_names = kwargs['tag_names'] + self.tag_names = tag_names self.lemmatizer = lemmatizer self._cache = PreshMapArray(len(self.tag_names)) self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) @@ -55,9 +55,16 @@ cdef class Morphologizer: self.tags[i].morph.person = props.get('person', 0) self.tags[i].morph.case = props.get('case', 0) self.tags[i].morph.misc = props.get('misc', 0) - #if path.exists(path.join(data_dir, 'morphs.json')): - # with open(path.join(data_dir, 'morphs.json')) as file_: - # self.load_exceptions(json.load(file_)) + if irregulars is not None: + self.load_exceptions(irregulars) + + @classmethod + def from_dir(cls, StringStore strings, object lemmatizer, data_dir): + tag_map = None + irregulars = None + tag_names = None + return cls(strings, lemmatizer, tag_map=tag_map, irregulars=irregulars, + tag_names=tag_names) cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: if self.lemmatizer is None: @@ -86,7 +93,6 @@ cdef class Morphologizer: cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) cached.morph = tag.morph self._cache.set(tag.id, tokens[i].lex.sic, cached) - tokens[i].lemma = cached.lemma tokens[i].morph = cached.morph diff --git a/spacy/ner/__init__.pxd b/spacy/ner/__init__.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/ner/__init__.py b/spacy/ner/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/ner/_feats.pxd b/spacy/ner/_feats.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/ner/_feats.pyx b/spacy/ner/_feats.pyx deleted file mode 100644 index 18e073c5b..000000000 --- a/spacy/ner/_feats.pyx +++ /dev/null @@ -1,169 +0,0 @@ -from spacy.context cimport FIELD_IDS, Token - - -cdef Token P4 = FIELD_IDS.P4 -cdef Token P3 = FIELD_IDS.P3 -cdef Token P2 = FIELD_IDS.P2 -cdef Token P1 = FIELD_IDS.P1 -cdef Token N0 = FIELD_IDS.N0 -cdef Token N1 = FIELD_IDS.N1 -cdef Token N2 = FIELD_IDS.N2 -cdef Token N3 = FIELD_IDS.N3 -cdef Token N4 = FIELD_IDS.N4 - -""" -TEMPLATES = ( - (N0.sic,), - (N0.cluster,), - - (P1.pos,), - (P1.sic,), - - (N1.norm,), - (N1.pos,), - - (P1.ner,), - (P2.ner,), - - (N0.cluster,), - (P1.cluster,), - (N1.cluster,), - - (N0.is_alpha,), - (N0.is_digit,), - (N0.is_title,), - (N0.is_upper,), - - (N0.is_title, N0.oft_title), - (N0.is_upper, N0.oft_upper), - - (P1.cluster, N0.norm), - (N0.norm, N1.cluster), - - (P1.ner, N0.pos), - (P2.ner, P1.ner, N0.pos), - - (P2.pos, P1.pos, N0.sic), - (N0.sic, N1.pos, N2.pos) -) -""" - -LOCAL = ( - (N0.sic,), - (P1.sic,), - (N1.sic,), - (P2.sic,), - (N2.sic,), - (P3.sic,), - (N3.sic,), - (P4.sic,), - (N4.sic,), - - (P1.sic, N0.sic,), - (N0.sic, N1.sic), - - (N0.prefix,), - (N0.suffix,), - - (P1.shape,), - (N0.shape,), - (N1.shape,), - (P1.shape, N0.shape,), - (N0.shape, P1.shape,), - (P1.shape, N0.shape, N1.shape), - (N2.shape,), - (P2.shape,), - (P3.shape,), - (N3.shape,), - (P4.shape,), - (N4.shape,), - - (P2.norm, P1.norm, N0.norm), - (P1.norm, N0.norm, N1.norm), - (N0.norm, N1.norm, N2.norm) -) - -BOOLS = ( - (N0.is_title,), -) - - -HISTORY = ( - (P1.ner,), - (P1.ner, N0.sic,), - (P2.ner,), - (P2.ner, P1.ner), - (P2.ner, P1.ner, N0.sic), - (P2.pos, P1.ner, N0.pos), - (P2.ner, P1.pos, N0.pos), - (P3.ner,), - (P4.ner,), -) - -POS = ( - 
(P4.pos,), - (P3.pos,), - (P2.pos,), - (P1.pos,), - (N0.pos,), - (N1.pos,), - (N2.pos,), - (N3.pos,), - (N4.pos,), - - (P1.pos, N0.pos), - (N0.pos, N1.pos), - (P2.pos, P1.pos, N0.pos), - (P1.pos, N0.pos, N1.pos), - (N0.pos, N1.pos, N2.pos) -) - -CLUSTERS = ( - (P4.cluster,), - (P3.cluster,), - (P2.cluster,), - (P1.cluster,), - (N0.cluster,), - (N1.cluster,), - (N2.cluster,), - (N3.cluster,), - (N4.cluster,), - - (P1.cluster, N0.cluster), - (N0.cluster, N1.cluster), -) - - -CLUSTER_POS = ( - (P1.cluster, N0.pos), - (N0.pos, P1.cluster), - (N0.cluster, N1.pos), - (N0.pos, N1.cluster) -) - - -GAZ = ( - (N0.in_males,), - (N0.in_females,), - (N0.in_surnames,), - (N0.in_places,), - (N0.in_games,), - (N0.in_celebs,), - (N0.in_names,), - (P1.in_males,), - (P1.in_females,), - (P1.in_surnames,), - (P1.in_places,), - (P1.in_games,), - (P1.in_celebs,), - (P1.in_names,), - (N1.in_males,), - (N1.in_females,), - (N1.in_surnames,), - (N1.in_places,), - (N1.in_games,), - (N1.in_celebs,), - (N1.in_names,), -) - -TEMPLATES = LOCAL + HISTORY + CLUSTERS + POS + CLUSTER_POS + GAZ + BOOLS diff --git a/spacy/ner/_state.pxd b/spacy/ner/_state.pxd deleted file mode 100644 index 2c5815799..000000000 --- a/spacy/ner/_state.pxd +++ /dev/null @@ -1,15 +0,0 @@ -from cymem.cymem cimport Pool -from .structs cimport State, Entity, Move - -cdef int begin_entity(State* s, label) except -1 - -cdef int end_entity(State* s) except -1 - -cdef State* init_state(Pool mem, int sent_length) except NULL -cdef int copy_state(Pool mem, State* dest, State* source) except -1 - -cdef bint entity_is_open(State *s) except -1 - -cdef int entity_is_sunk(State *s, Move* golds) except -1 - -cdef int is_done(State* s) except -1 diff --git a/spacy/ner/_state.pyx b/spacy/ner/_state.pyx deleted file mode 100644 index 0a63c4d43..000000000 --- a/spacy/ner/_state.pyx +++ /dev/null @@ -1,54 +0,0 @@ -from libc.string cimport memcpy - - -cdef int begin_entity(State* s, label) except -1: - s.j += 1 - s.ents[s.j].start = s.i - s.ents[s.j].tag = label - s.ents[s.j].end = s.i + 1 - - -cdef int end_entity(State* s) except -1: - s.ents[s.j].end = s.i + 1 - - -cdef State* init_state(Pool mem, int sent_length) except NULL: - s = mem.alloc(1, sizeof(State)) - s.ents = mem.alloc(sent_length, sizeof(Entity)) - s.tags = mem.alloc(sent_length, sizeof(int)) - s.length = sent_length - - -cdef bint entity_is_open(State *s) except -1: - return s.ents[s.j].start != 0 - - -cdef int entity_is_sunk(State *s, Move* golds) except -1: - if not entity_is_open(s): - return False - raise StandardError - #cdef Entity* ent = &s.ents[s.j] - #cdef Move* gold = &golds[ent.start] - #if gold.action != BEGIN and gold.action != UNIT: - # return True - #elif gold.label != ent.label: - # return True - #else: - # return False - - -cdef int copy_state(Pool mem, State* dest, State* source) except -1: - '''Copy state source into state dest.''' - if source.length > dest.length: - dest.ents = mem.realloc(dest.ents, source.length * sizeof(Entity)) - dest.tags = mem.realloc(dest.tags, source.length * sizeof(int)) - memcpy(dest.ents, source.ents, source.length * sizeof(Entity)) - memcpy(dest.tags, source.tags, source.length * sizeof(int)) - dest.length = source.length - dest.i = source.i - dest.j = source.j - dest.curr = source.curr - - -cdef int is_done(State* s) except -1: - return s.i >= s.length and not entity_is_open(s) diff --git a/spacy/ner/annot.pxd b/spacy/ner/annot.pxd deleted file mode 100644 index b1b49d64f..000000000 --- a/spacy/ner/annot.pxd +++ /dev/null @@ -1,8 +0,0 @@ -from 
cymem.cymem cimport Pool - -cdef class NERAnnotation: - cdef Pool mem - cdef int* starts - cdef int* ends - cdef int* labels - cdef readonly list entities diff --git a/spacy/ner/annot.pyx b/spacy/ner/annot.pyx deleted file mode 100644 index d04345319..000000000 --- a/spacy/ner/annot.pyx +++ /dev/null @@ -1,94 +0,0 @@ -from libc.string cimport memset - - -cdef class NERAnnotation: - def __init__(self, entities, length, entity_types): - self.mem = Pool() - self.starts = self.mem.alloc(length, sizeof(int)) - self.ends = self.mem.alloc(length, sizeof(int)) - self.labels = self.mem.alloc(length, sizeof(int)) - self.entities = entities - memset(self.starts, -1, sizeof(int) * length) - memset(self.ends, -1, sizeof(int) * length) - memset(self.labels, -1, sizeof(int) * length) - - cdef int start, end, label - for start, end, label in entities: - for i in range(start, end): - self.starts[i] = start - self.ends[i] = end - self.labels[i] = label - - @classmethod - def from_bilous(cls, tag_strs, entity_types): - entities = [] - start = None - for i, tag_str in enumerate(tag_strs): - if tag_str == 'O' or tag_str == '-': - continue - move, label_str = tag_str.split('-') - label = entity_types.index(label_str) - if label == -1: - label = len(entity_types) - entity_types.append(label) - if move == 'U': - assert start is None - entities.append((i, i+1, label)) - elif move == 'B': - assert start is None - start = i - elif move == 'L': - assert start is not None - entities.append((start, i+1, label)) - start = None - return cls(entities, len(tag_strs), entity_types) - - - -def read_iob(file_, entity_types, create_tokens): - sent_strs = file_.read().strip().split('\n\n') - sents = [] - for sent_str in sent_strs: - if sent_str.startswith('-DOCSTART-'): - continue - words = [] - iob = [] - for token_str in sent_str.split('\n'): - word, pos, chunk, ner = token_str.split() - words.append(word) - iob.append(ner) - bilou = iob_to_bilou(iob) - tokens = create_tokens(words) - sents.append((tokens, NERAnnotation.from_bilous(bilou, entity_types))) - return sents - - -def iob_to_bilou(tags): - out = [] - curr_label = None - tags = list(tags) - while tags: - out.extend(_consume_os(tags)) - out.extend(_consume_ent(tags)) - return out - -def _consume_os(tags): - while tags and tags[0] == 'O': - yield tags.pop(0) - -def _consume_ent(tags): - if not tags: - return [] - target = tags.pop(0).replace('B', 'I') - length = 1 - while tags and tags[0] == target: - length += 1 - tags.pop(0) - label = target[2:] - if length == 1: - return ['U-' + label] - else: - start = 'B-' + label - end = 'L-' + label - middle = ['I-%s' % label for _ in range(1, length - 1)] - return [start] + middle + [end] diff --git a/spacy/ner/bilou_moves.pxd b/spacy/ner/bilou_moves.pxd deleted file mode 100644 index 20ec58291..000000000 --- a/spacy/ner/bilou_moves.pxd +++ /dev/null @@ -1,27 +0,0 @@ -from cymem.cymem cimport Pool - -from thinc.typedefs cimport class_t -from thinc.typedefs cimport weight_t - -from .structs cimport State, Move - - -cpdef enum ActionType: - MISSING - BEGIN - IN - LAST - UNIT - OUT - N_ACTIONS - - -cdef int set_accept_if_oracle(Move* moves, Move* golds, int n, State* s) except 0 - -cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0 - -cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL - -cdef int transition(State *s, Move* m) except -1 - -cdef int fill_moves(Move* moves, list tag_names) except -1 diff --git a/spacy/ner/bilou_moves.pyx b/spacy/ner/bilou_moves.pyx deleted file mode 100644 
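A worked example of the IOB-to-BILOU conversion defined by iob_to_bilou() in annot.pyx above (toy tag sequences, chosen only for illustration):

    print(iob_to_bilou(['O', 'I-PER', 'O']))
    # -> ['O', 'U-PER', 'O']               a single-token entity becomes U-
    print(iob_to_bilou(['B-ORG', 'I-ORG', 'I-ORG', 'O']))
    # -> ['B-ORG', 'I-ORG', 'L-ORG', 'O']  the final entity token becomes L-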
index 42cef3fb7..000000000 --- a/spacy/ner/bilou_moves.pyx +++ /dev/null @@ -1,207 +0,0 @@ -from __future__ import unicode_literals - -from ._state cimport begin_entity -from ._state cimport end_entity -from ._state cimport entity_is_open -from ._state cimport entity_is_sunk - - -ACTION_NAMES = ['' for _ in range(N_ACTIONS)] -ACTION_NAMES[MISSING] = '?' -ACTION_NAMES[BEGIN] = 'B' -ACTION_NAMES[IN] = 'I' -ACTION_NAMES[LAST] = 'L' -ACTION_NAMES[UNIT] = 'U' -ACTION_NAMES[OUT] = 'O' - - -cdef bint can_begin(State* s, int label): - return not entity_is_open(s) - - -cdef bint can_in(State* s, int label): - return entity_is_open(s) and s.curr.label == label - - -cdef bint can_last(State* s, int label): - return entity_is_open(s) and s.curr.label == label - - -cdef bint can_unit(State* s, int label): - return not entity_is_open(s) - - -cdef bint can_out(State* s, int label): - return not entity_is_open(s) - - -cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag, - ActionType next_act, bint is_sunk): - if g_act == MISSING: - return True - if act == BEGIN: - if g_act == BEGIN: - # B, Gold B --> Label match - return tag == g_tag - else: - # B, Gold I --> False (P) - # B, Gold L --> False (P) - # B, Gold O --> False (P) - # B, Gold U --> False (P) - return False - elif act == IN: - if g_act == BEGIN: - # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk) - return True - elif g_act == IN: - # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk) - return True - elif g_act == LAST: - # I, Gold L --> True iff this entity sunk and next tag == O - return is_sunk and (next_act == OUT or next_act == MISSING) - elif g_act == OUT: - # I, Gold O --> True iff next tag == O - return next_act == OUT or next_act == MISSING - elif g_act == UNIT: - # I, Gold U --> True iff next tag == O - return next_act == OUT - elif act == LAST: - if g_act == BEGIN: - # L, Gold B --> True - return True - elif g_act == IN: - # L, Gold I --> True iff this entity sunk - return is_sunk - elif g_act == LAST: - # L, Gold L --> True - return True - elif g_act == OUT: - # L, Gold O --> True - return True - elif g_act == UNIT: - # L, Gold U --> True - return True - elif act == OUT: - if g_act == BEGIN: - # O, Gold B --> False - return False - elif g_act == IN: - # O, Gold I --> True - return True - elif g_act == LAST: - # O, Gold L --> True - return True - elif g_act == OUT: - # O, Gold O --> True - return True - elif g_act == UNIT: - # O, Gold U --> False - return False - elif act == UNIT: - if g_act == UNIT: - # U, Gold U --> True iff tag match - return tag == g_tag - else: - # U, Gold B --> False - # U, Gold I --> False - # U, Gold L --> False - # U, Gold O --> False - return False - - -cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0: - cdef int n_accept = 0 - cdef Move* m - moves[0].accept = False - for i in range(1, n_classes): - m = &moves[i] - if m.action == BEGIN: - m.accept = can_begin(s, m.label) - elif m.action == IN: - m.accept = can_in(s, m.label) - elif m.action == LAST: - m.accept = can_last(s, m.label) - elif m.action == UNIT: - m.accept = can_unit(s, m.label) - elif m.action == OUT: - m.accept = can_out(s, m.label) - n_accept += m.accept - assert n_accept != 0 - return n_accept - - -cdef int set_accept_if_oracle(Move* moves, Move* golds, int n_classes, State* s) except 0: - - cdef Move* g = &golds[s.i] - cdef ActionType next_act = golds[s.i+1].action if s.i < s.length else OUT - cdef bint is_sunk = entity_is_sunk(s, golds) - cdef Move* m 
- cdef int n_accept = 0 - set_accept_if_valid(moves, n_classes, s) - for i in range(1, n_classes): - m = &moves[i] - if not m.accept: - continue - m.accept = is_oracle(m.action, m.label, g.action, - g.label, next_act, is_sunk) - n_accept += m.accept - assert n_accept != 0 - return n_accept - - -cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL: - cdef int first_accept = -1 - for first_accept in range(1, n): - if moves[first_accept].accept: - break - else: - raise StandardError - assert first_accept != -1 - cdef int best = first_accept - cdef weight_t score = scores[first_accept-1] - cdef int i - for i in range(first_accept+1, n): - if moves[i].accept and scores[i-1] > score: - best = i - score = scores[i-1] - return &moves[best] - - -cdef int transition(State *s, Move* move) except -1: - if move.action == BEGIN: - begin_entity(s, move.label) - elif move.action == IN: - pass - elif move.action == LAST: - end_entity(s) - elif move.action == UNIT: - begin_entity(s, move.label) - end_entity(s) - elif move.action == OUT: - pass - s.tags[s.i] = move.clas - s.i += 1 - - -def get_n_moves(n_tags): - return n_tags + n_tags + n_tags + n_tags + 1 - - -cdef int fill_moves(Move* moves, list tag_names) except -1: - cdef Move* m - label_names = {'-': 0} - for i, tag_name in enumerate(tag_names): - m = &moves[i] - if '-' in tag_name: - action_str, label = tag_name.split('-') - elif tag_name == 'O': - action_str = 'O' - label = '-' - elif tag_name == 'NULL' or tag_name == 'EOL': - action_str = '?' - label = '-' - else: - raise StandardError(tag_name) - m.action = ACTION_NAMES.index(action_str) - m.label = label_names.setdefault(label, len(label_names)) - m.clas = i diff --git a/spacy/ner/context.pxd b/spacy/ner/context.pxd deleted file mode 100644 index c12ecc041..000000000 --- a/spacy/ner/context.pxd +++ /dev/null @@ -1,155 +0,0 @@ -from thinc.typedefs cimport atom_t -from ..typedefs cimport hash_t -from ..tokens cimport Tokens -from ..lexeme cimport Lexeme -from .structs cimport State - - -cpdef enum: - T_sic - T_cluster - T_norm - T_shape - T_asciied - T_prefix - T_suffix - T_length - T_postype - T_nertype - T_sensetype - T_is_alpha - T_is_ascii - T_is_digit - T_is_lower - T_is_punct - T_is_space - T_is_title - T_is_upper - T_like_url - T_like_number - T_oft_lower - T_oft_title - T_oft_upper - T_in_males - T_in_females - T_in_surnames - T_in_places - T_in_celebs - T_in_names - T_pos - T_sense - T_ner - - -cpdef enum: - P2_sic - P2_cluster - P2_norm - P2_shape - P2_prefix - P2_suffix - P2_length - P2_postype - P2_is_alpha - P2_is_digit - P2_is_lower - P2_is_punct - P2_is_title - P2_is_upper - P2_like_number - P2_pos - - P1_sic - P1_cluster - P1_norm - P1_shape - P1_prefix - P1_suffix - P1_length - P1_postype - P1_is_alpha - P1_is_digit - P1_is_lower - P1_is_punct - P1_is_title - P1_is_upper - P1_like_number - P1_pos - - W_sic - W_cluster - W_norm - W_shape - W_prefix - W_suffix - W_length - W_postype - W_is_alpha - W_is_digit - W_is_lower - W_is_punct - W_is_space - W_is_title - W_is_upper - W_like_number - W_pos - - N1_sic - N1_cluster - N1_norm - N1_shape - N1_prefix - N1_suffix - N1_length - N1_postype - N1_is_alpha - N1_is_ascii - N1_is_digit - N1_is_lower - N1_is_punct - N1_is_space - N1_is_title - N1_is_upper - N1_like_number - N1_pos - - N2_sic - N2_cluster - N2_norm - N2_shape - N2_asciied - N2_prefix - N2_suffix - N2_length - N2_postype - N2_is_alpha - N2_is_digit - N2_is_lower - N2_is_punct - N2_is_space - N2_is_title - N2_is_upper - N2_like_number - N2_pos - N2_sense - 
- E_label - - E0_sic - E0_cluster - E0_pos - - E1_sic - E1_cluster - E1_pos - - E_last_sic - E_last_cluster - E_last_pos - - N_FIELDS - - -cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1 - - diff --git a/spacy/ner/context.pyx b/spacy/ner/context.pyx deleted file mode 100644 index c22685dfd..000000000 --- a/spacy/ner/context.pyx +++ /dev/null @@ -1,77 +0,0 @@ -from libc.string cimport memset - -from murmurhash.mrmr cimport hash64 -from ._state cimport entity_is_open -from ..lexeme cimport * - - -cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos): - c[T_sic] = lex.sic - c[T_cluster] = lex.cluster - c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape - c[T_shape] = lex.shape - c[T_asciied] = lex.asciied - c[T_prefix] = lex.prefix - c[T_suffix] = lex.suffix - c[T_length] = lex.length - - c[T_postype] = lex.postype - c[T_nertype] = 0 - c[T_sensetype] = 0 - - c[T_is_alpha] = lex.flags & (1 << IS_ALPHA) - c[T_is_digit] = lex.flags & (1 << IS_DIGIT) - c[T_is_lower] = lex.flags & (1 << IS_LOWER) - c[T_is_punct] = lex.flags & (1 << IS_PUNCT) - c[T_is_space] = lex.flags & (1 << IS_SPACE) - c[T_is_title] = lex.flags & (1 << IS_TITLE) - c[T_is_upper] = lex.flags & (1 << IS_UPPER) - c[T_like_url] = lex.flags & (1 << LIKE_URL) - c[T_like_number] = lex.flags & (1 << LIKE_NUMBER) - c[T_oft_lower] = lex.flags & (1 << OFT_LOWER) - c[T_oft_title] = lex.flags & (1 << OFT_TITLE) - c[T_oft_upper] = lex.flags & (1 << OFT_UPPER) - - c[T_in_males] = lex.flags & (1 << IN_MALES) - c[T_in_females] = lex.flags & (1 << IN_FEMALES) - c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES) - c[T_in_places] = lex.flags & (1 << IN_PLACES) - c[T_in_celebs] = lex.flags & (1 << IN_CELEBS) - c[T_in_names] = lex.flags & (1 << IN_NAMES) - - c[T_pos] = pos - c[T_sense] = 0 - - -cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos): - c[0] = lex.sic - c[1] = lex.cluster - c[2] = lex.shape - c[3] = pos - - -cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1: - cdef int i - for i in range(N_FIELDS): - context[i] = 0 - i = s.i - _fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2]) - _fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1]) - _fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i]) - _fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1]) - _fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2]) - - cdef atom_t[5] ent_vals - if entity_is_open(s): - context[E_label] = s.curr.label - context[E0_sic] = tokens.lex[s.curr.start].sic - context[E0_cluster] = tokens.lex[s.curr.start].cluster - context[E0_pos] = tokens.pos[s.curr.start] - context[E_last_sic] = tokens.lex[s.i-1].sic - context[E_last_cluster] = tokens.lex[s.i-1].cluster - context[E_last_pos] = tokens.pos[s.i-1] - if (s.curr.start + 1) < s.i: - context[E1_sic] = tokens.lex[s.curr.start+1].sic - context[E1_cluster] = tokens.lex[s.curr.start+1].cluster - context[E1_pos] = tokens.pos[s.curr.start+1] - return 1 diff --git a/spacy/ner/feats.pxd b/spacy/ner/feats.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/ner/feats.pyx b/spacy/ner/feats.pyx deleted file mode 100644 index e28225632..000000000 --- a/spacy/ner/feats.pyx +++ /dev/null @@ -1,107 +0,0 @@ -from .context import * - - -LOCAL = ( - (W_sic,), - (P1_sic,), - (N1_sic,), - (P2_sic,), - (N2_sic,), - - (P1_sic, W_sic,), - (W_sic, N1_sic), - - (W_prefix,), - (W_suffix,), - - (P1_shape,), - (W_shape,), - (N1_shape,), - (P1_shape, W_shape,), - (W_shape, P1_shape,), - 
(P1_shape, W_shape, N1_shape), - (N2_shape,), - (P2_shape,), - - (P2_norm, P1_norm, W_norm), - (P1_norm, W_norm, N1_norm), - (W_norm, N1_norm, N2_norm) -) - -POS = ( - (P2_pos,), - (P1_pos,), - (W_pos,), - (N1_pos,), - (N2_pos,), - - (P1_pos, W_pos), - (W_pos, N1_pos), - (P2_pos, P1_pos, W_pos), - (P1_pos, W_pos, N1_pos), - (W_pos, N1_pos, N2_pos) -) - -CLUSTERS = ( - (P2_cluster,), - (P1_cluster,), - (W_cluster,), - (N1_cluster,), - (N2_cluster,), - - (P1_cluster, W_cluster), - (W_cluster, N1_cluster), -) - - -CLUSTER_POS = ( - (P1_cluster, W_pos), - (W_pos, P1_cluster), - (W_cluster, N1_pos), - (W_pos, N1_cluster) -) - - -STATE = ( - (E0_sic,), - (E0_cluster,), - (E0_pos,), - (E_last_sic,), - (E_last_cluster,), - (E_last_pos,), - - (E0_sic, W_sic), - (E0_cluster, W_cluster), - (E0_pos, W_pos), - (E_last_sic, W_sic), - (E_last_pos, W_pos), - - (E0_pos, E_last_pos, W_pos), - (E0_cluster, E_last_cluster, W_cluster), - - (E0_sic, E_last_sic), - (E0_pos, E_last_pos), - (E0_cluster, E_last_cluster), - (E0_pos, E_last_cluster), - (E0_cluster, E_last_pos), - - (E1_sic,), - (E1_cluster,), - (E1_pos,), - - (E0_sic, E1_sic), - (E0_sic, E1_pos,), - (E0_pos, E1_sic,), - (E0_pos, E1_pos), - - (E_label,), - (E_label, W_sic), - (E_label, W_pos), - (E_label, W_cluster), - (E_label, W_shape), - (E_label, E_last_sic), - (E_label, E0_pos, E_last_pos), -) - - -TEMPLATES = LOCAL + CLUSTERS + POS + CLUSTER_POS + STATE diff --git a/spacy/ner/greedy_parser.pxd b/spacy/ner/greedy_parser.pxd deleted file mode 100644 index 9ee4d668d..000000000 --- a/spacy/ner/greedy_parser.pxd +++ /dev/null @@ -1,29 +0,0 @@ -from cymem.cymem cimport Pool -from thinc.features cimport Extractor -from thinc.learner cimport LinearModel -from thinc.typedefs cimport * - -from ..tokens cimport Tokens -from ..typedefs cimport * - -from .structs cimport Move -from .annot cimport NERAnnotation - - -cdef class NERParser: - cdef Pool mem - cdef Extractor extractor - cdef LinearModel model - cdef readonly list tag_names - cdef readonly list entity_types - cdef readonly int n_classes - - cdef Move* _moves - cdef atom_t* _context - cdef feat_t* _feats - cdef weight_t* _values - cdef weight_t* _scores - - - cpdef list train(self, Tokens tokens, NERAnnotation annot) - cpdef list set_tags(self, Tokens tokens) diff --git a/spacy/ner/greedy_parser.pyx b/spacy/ner/greedy_parser.pyx deleted file mode 100644 index 2e3af5717..000000000 --- a/spacy/ner/greedy_parser.pyx +++ /dev/null @@ -1,81 +0,0 @@ -cimport cython -import random -import os -from os import path -import shutil -import json - -from thinc.features cimport ConjFeat - -from ..context cimport fill_context -from ..context cimport N_FIELDS -from .moves cimport Move -from .moves cimport fill_moves, transition, best_accepted -from .moves cimport set_accept_if_valid, set_accept_if_oracle -from .moves import get_n_moves -from ._state cimport State -from ._state cimport init_state - - -cdef class NERParser: - def __init__(self, model_dir): - self.mem = Pool() - cfg = json.load(open(path.join(model_dir, 'config.json'))) - templates = cfg['templates'] - self.entity_types = cfg['entity_types'] - self.extractor = Extractor(templates, [ConjFeat] * len(templates)) - self.n_classes = get_n_moves(len(self.entity_types)) - self._moves = self.mem.alloc(self.n_classes, sizeof(Move)) - fill_moves(self._moves, len(self.entity_types)) - self.model = LinearModel(len(self.tag_names)) - if path.exists(path.join(model_dir, 'model')): - self.model.load(path.join(model_dir, 'model')) - - self._context = 
self.mem.alloc(N_FIELDS, sizeof(atom_t)) - self._feats = self.mem.alloc(self.extractor.n+1, sizeof(feat_t)) - self._values = self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) - self._scores = self.mem.alloc(self.model.nr_class, sizeof(weight_t)) - - cpdef int train(self, Tokens tokens, gold_classes): - cdef Pool mem = Pool() - cdef State* s = init_state(mem, tokens.length) - cdef Move* golds = mem.alloc(len(gold_classes), sizeof(Move)) - for i, clas in enumerate(gold_classes): - golds[i] = self.moves[clas - 1] - assert golds[i].id == clas - cdef Move* guess - while s.i < tokens.length: - fill_context(self._context, s.i, tokens) - self.extractor.extract(self._feats, self._values, self._context, NULL) - self.model.score(self._scores, self._feats, self._values) - - set_accept_if_valid(self._moves, self.n_classes, s) - guess = best_accepted(self._moves, self._scores, self.n_classes) - - set_accept_if_oracle(self._moves, golds, self.n_classes, s) # TODO - gold = best_accepted(self._moves, self._scores, self.n_classes) - - if guess.clas == gold.clas: - self.model.update({}) - return 0 - - counts = {guess.clas: {}, gold.clas: {}} - self.extractor.count(counts[gold.clas], self._feats, 1) - self.extractor.count(counts[guess.clas], self._feats, -1) - self.model.update(counts) - - transition(s, guess) - tokens.ner[s.i-1] = s.tags[s.i-1] - - cpdef int set_tags(self, Tokens tokens) except -1: - cdef Pool mem = Pool() - cdef State* s = init_state(mem, tokens.length) - cdef Move* move - while s.i < tokens.length: - fill_context(self._context, s.i, tokens) - self.extractor.extract(self._feats, self._values, self._context, NULL) - self.model.score(self._scores, self._feats, self._values) - set_accept_if_valid(self._moves, self.n_classes, s) - move = best_accepted(self._moves, self._scores, self.n_classes) - transition(s, move) - tokens.ner[s.i-1] = s.tags[s.i-1] diff --git a/spacy/ner/io_moves.pxd b/spacy/ner/io_moves.pxd deleted file mode 100644 index 97f9512e8..000000000 --- a/spacy/ner/io_moves.pxd +++ /dev/null @@ -1,26 +0,0 @@ -from cymem.cymem cimport Pool - -from thinc.typedefs cimport class_t -from thinc.typedefs cimport weight_t - -from .structs cimport State, Move - - -cpdef enum ActionType: - MISSING - SHIFT - REDUCE - OUT - N_ACTIONS - - -cdef int set_accept_if_oracle(Move* moves, int n, State* s, - int* g_starts, int* g_ends, int* g_labels) except 0 - -cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0 - -cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL - -cdef int transition(State *s, Move* m) except -1 - -cdef int fill_moves(Move* moves, int n, list entity_types) except -1 diff --git a/spacy/ner/io_moves.pyx b/spacy/ner/io_moves.pyx deleted file mode 100644 index 6e892ddf5..000000000 --- a/spacy/ner/io_moves.pyx +++ /dev/null @@ -1,161 +0,0 @@ -from __future__ import unicode_literals -from cymem.cymem cimport Pool - -from thinc.typedefs cimport class_t -from thinc.typedefs cimport weight_t - -from ._state cimport begin_entity -from ._state cimport end_entity -from ._state cimport entity_is_open - - -ACTION_NAMES = ['' for _ in range(N_ACTIONS)] -ACTION_NAMES[MISSING] = '?' 
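The train() loop in greedy_parser.pyx above follows the usual greedy transition-based perceptron recipe: score every class, take the best valid move as the guess and the best oracle-accepted move as the gold, and update weights only on disagreement. A pure-Python sketch of that update step (toy types and names, not the thinc API):

    def perceptron_step(weights, feats, n_classes, guess, gold):
        # weights: feature -> list of per-class weights
        if guess == gold:
            return  # correct prediction: no update, like self.model.update({})
        for f in feats:
            w = weights.setdefault(f, [0.0] * n_classes)
            w[gold] += 1.0    # promote the oracle-approved move
            w[guess] -= 1.0   # demote the mistaken guess

    weights = {}
    perceptron_step(weights, ['W_sic=York', 'P1_pos=NNP'], 4, guess=2, gold=1)
    print(weights['W_sic=York'])   # [0.0, 1.0, -1.0, 0.0]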
-ACTION_NAMES[SHIFT] = 'S' -ACTION_NAMES[REDUCE] = 'R' -ACTION_NAMES[OUT] = 'O' - - -cdef int set_accept_if_oracle(Move* moves, int n, State* s, - int* g_starts, int* g_ends, int* g_labels) except 0: - # If curr entity: (O invalid) - # if cost is not sunk (start matches, end is i-1 or greater - # - If i-1 == gold.end --> R=True, S=False - # - Shift if end >= i --> S=True, R=False - # else - # - If i == gold.start --> R=True, S=False - # - Else --> R=True, S=True - # Else (R invalid): - # if start == gold.start: S=True, O=False - # else: O=True, S=False - if entity_is_open(s): - g_start = g_starts[s.curr.start] - g_end = g_ends[s.curr.start] - accept_o = False - if g_start == s.curr.start and g_end == s.i: - accept_r = True - accept_s = False - elif g_start == s.curr.start and g_end > s.i: - accept_s = True - s_label = s.curr.label - accept_r = False - elif g_starts[s.i] == s.i: - accept_r = True - accept_s = False - else: - accept_r = True - accept_s = True - s_label = s.curr.label - else: - accept_r = False - if g_starts[s.i] == s.i: - accept_s = True - s_label = g_labels[s.i] - accept_o = False - else: - accept_o = True - accept_s = False - n_accept = 0 - moves[0].accept = False - for i in range(1, n): - m = &moves[i] - if m.action == SHIFT: - m.accept = accept_s and m.label == s_label - elif m.action == REDUCE: - m.accept = accept_r - elif m.action == OUT: - m.accept = accept_o - n_accept += m.accept - assert n_accept != 0 - return n_accept - - -cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0: - cdef int i - cdef bint open_ent = entity_is_open(s) - cdef int n_accept = 0 - moves[0].accept = False - for i in range(1, n): - if moves[i].action == SHIFT: - if s.i >= s.length: - moves[i].accept = False - elif open_ent and moves[i].label != s.curr.label: - moves[i].accept = False - else: - moves[i].accept = True - elif moves[i].action == REDUCE: - moves[i].accept = open_ent - elif moves[i].action == OUT: - moves[i].accept = s.i < s.length and not open_ent - n_accept += moves[i].accept - return n_accept - - -cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL: - cdef int first_accept = -1 - for first_accept in range(1, n): - if moves[first_accept].accept: - break - else: - raise StandardError - assert first_accept != -1 - cdef int best = first_accept - cdef weight_t score = scores[first_accept-1] - cdef int i - for i in range(first_accept+1, n): - if moves[i].accept and scores[i-1] > score: - best = i - score = scores[i-1] - return &moves[best] - - -cdef int transition(State *s, Move* move) except -1: - s.tags[s.i] = move.clas - if move.action == OUT: - s.i += 1 - elif move.action == SHIFT: - if not entity_is_open(s): - s.curr.start = s.i - s.curr.label = move.label - s.i += 1 - elif move.action == REDUCE: - s.curr.end = s.i - s.ents[s.j] = s.curr - s.j += 1 - s.curr.start = 0 - s.curr.label = -1 - s.curr.end = 0 - else: - raise ValueError(move.action) - - -def get_n_moves(n_tags): - return 1 + 1 + 1 + n_tags - - -cdef int fill_moves(Move* moves, int n, list entity_types) except -1: - cdef Move* m - label_names = {'-': 0} - # Reserve class 0 - cdef int i = 0 - moves[i].clas = i - moves[i].action = MISSING - moves[i].label = 0 - i += 1 - for entity_type in entity_types: - moves[i].action = SHIFT - moves[i].label = label_names.setdefault(entity_type, len(label_names)) - moves[i].clas = i - i += 1 - moves[i].clas = i - moves[i].action = OUT - moves[i].label = 0 - i += 1 - moves[i].action = REDUCE - moves[i].clas = i - moves[i].label = 0 - i += 1 - - -cdef 
bint is_final(State* s): - return s.i == s.length and not entity_is_open(s) diff --git a/spacy/ner/pystate.pxd b/spacy/ner/pystate.pxd deleted file mode 100644 index 9293fae01..000000000 --- a/spacy/ner/pystate.pxd +++ /dev/null @@ -1,16 +0,0 @@ -from cymem.cymem cimport Pool - -from .structs cimport Move, State - - -cdef class PyState: - cdef Pool mem - cdef readonly list tag_names - cdef readonly int n_classes - cdef readonly dict moves_by_name - - cdef Move* _moves - cdef Move* _golds - cdef State* _s - - cdef Move* _get_move(self, unicode move_name) except NULL diff --git a/spacy/ner/pystate.pyx b/spacy/ner/pystate.pyx deleted file mode 100644 index ba18c2f07..000000000 --- a/spacy/ner/pystate.pyx +++ /dev/null @@ -1,60 +0,0 @@ -from __future__ import unicode_literals - -from ._state cimport init_state -from ._state cimport entity_is_open -from .bilou_moves cimport fill_moves -from .bilou_moves cimport transition -from .bilou_moves cimport set_accept_if_valid, set_accept_if_oracle -from .bilou_moves import get_n_moves -from .bilou_moves import ACTION_NAMES - - -cdef class PyState: - def __init__(self, tag_names, n_tokens): - self.mem = Pool() - self.tag_names = tag_names - self.n_classes = len(tag_names) - assert self.n_classes != 0 - self._moves = self.mem.alloc(self.n_classes, sizeof(Move)) - fill_moves(self._moves, tag_names) - self._s = init_state(self.mem, n_tokens) - self._golds = self.mem.alloc(n_tokens, sizeof(Move)) - - cdef Move* _get_move(self, unicode move_name) except NULL: - return &self._moves[self.tag_names.index(move_name)] - - def set_golds(self, list gold_names): - cdef Move* m - for i, name in enumerate(gold_names): - m = self._get_move(name) - self._golds[i] = m[0] - - def transition(self, unicode move_name): - cdef Move* m = self._get_move(move_name) - transition(self._s, m) - - def is_valid(self, unicode move_name): - cdef Move* m = self._get_move(move_name) - set_accept_if_valid(self._moves, self.n_classes, self._s) - return m.accept - - def is_gold(self, unicode move_name): - cdef Move* m = self._get_move(move_name) - set_accept_if_oracle(self._moves, self._golds, self.n_classes, self._s) - return m.accept - - property ent: - def __get__(self): - return self._s.curr - - property n_ents: - def __get__(self): - return self._s.j - - property i: - def __get__(self): - return self._s.i - - property open_entity: - def __get__(self): - return entity_is_open(self._s) diff --git a/spacy/ner/structs.pxd b/spacy/ner/structs.pxd deleted file mode 100644 index 7d6ebed19..000000000 --- a/spacy/ner/structs.pxd +++ /dev/null @@ -1,23 +0,0 @@ -from thinc.typedefs cimport class_t - - -cdef struct Entity: - int start - int end - int label - - -cdef struct State: - Entity curr - Entity* ents - int* tags - int i - int j - int length - - -cdef struct Move: - class_t clas - int action - int label - bint accept diff --git a/spacy/pos_feats.pxd b/spacy/pos_feats.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/pos_feats.pyx b/spacy/pos_feats.pyx deleted file mode 100644 index e8a2699d4..000000000 --- a/spacy/pos_feats.pyx +++ /dev/null @@ -1,41 +0,0 @@ -from spacy.context cimport FIELD_IDS, Token - - -cpdef Token P2 = FIELD_IDS.P2 -cpdef Token P1 = FIELD_IDS.P1 -cpdef Token N0 = FIELD_IDS.N0 -cpdef Token N1 = FIELD_IDS.N1 -cpdef Token N2 = FIELD_IDS.N2 - - -TEMPLATES = ( - (N0.sic,), - (N0.norm,), - (N0.suffix,), - (N0.prefix,), - (P1.pos,), - (P2.pos,), - (P1.pos, P2.pos), - (P1.pos, N0.norm), - (P1.norm,), - (P1.suffix,), - (P2.norm,), - (N1.norm,), - 
(N1.suffix,), - (N2.norm,), - - (N0.shape,), - (N0.cluster,), - (N1.cluster,), - (N2.cluster,), - (P1.cluster,), - (P2.cluster,), - (N0.oft_upper,), - (N0.oft_title,), - - (N0.postype,), - - (P1.like_url,), - (N1.like_number,), - (N1.like_url,), -) diff --git a/spacy/pos_util.py b/spacy/pos_util.py deleted file mode 100644 index 489f03dde..000000000 --- a/spacy/pos_util.py +++ /dev/null @@ -1,153 +0,0 @@ -from __future__ import unicode_literals -from . import util -from . import tokens -from .en import EN - - -def read_gold(file_, tag_list, col): - paras = file_.read().strip().split('\n\n') - golds = [] - tag_ids = dict((tag, i) for i, tag in enumerate(tag_list)) - for para in paras: - if not para.strip(): - continue - lines = para.strip().split('\n') - raw = lines.pop(0) - gold_toks = lines.pop(0) - tokens = EN.tokenize(raw) - tags = [] - conll_toks = [] - for line in lines: - pieces = line.split() - conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[col])) - for i, token in enumerate(tokens): - if not conll_toks: - tags.append('NULL') - elif token.idx == conll_toks[0][0]: - tags.append(conll_toks[0][2]) - conll_toks.pop(0) - elif token.idx < conll_toks[0]: - tags.append('NULL') - else: - conll_toks.pop(0) - assert len(tags) == len(tokens) - tags = [_encode_pos(t, tag_ids, tag_list) for t in tags] - golds.append((tokens, tags)) - return golds - -def _encode_pos(tag, tag_ids, tag_list): - if tag == '-': - return 0 - if tag not in tag_ids: - tag_ids[tag] = len(tag_list) - tag_list.append(tag) - return tag_ids[tag] - - -def ptb_to_univ(tag): - mapping = dict(tuple(line.split()) for line in """ -NULL NULL -HYPH . -ADD X -NFP . -AFX X -XX X -BES VERB -HVS VERB -GW X -! . -# . -$ . -'' . -( . -) . -, . --LRB- . --RRB- . -. . -: . -? . -CC CONJ -CD NUM -CD|RB X -DT DET -EX DET -FW X -IN ADP -IN|RP ADP -JJ ADJ -JJR ADJ -JJRJR ADJ -JJS ADJ -JJ|RB ADJ -JJ|VBG ADJ -LS X -MD VERB -NN NOUN -NNP NOUN -NNPS NOUN -NNS NOUN -NN|NNS NOUN -NN|SYM NOUN -NN|VBG NOUN -NP NOUN -PDT DET -POS PRT -PRP PRON -PRP$ PRON -PRP|VBP PRON -PRT PRT -RB ADV -RBR ADV -RBS ADV -RB|RP ADV -RB|VBG ADV -RN X -RP PRT -SYM X -TO PRT -UH X -VB VERB -VBD VERB -VBD|VBN VERB -VBG VERB -VBG|NN VERB -VBN VERB -VBP VERB -VBP|TO VERB -VBZ VERB -VP VERB -WDT DET -WH X -WP PRON -WP$ PRON -WRB ADV -! PRT -# X -$ NUM -& CONJ -, . -@ X -A ADJ -D DET -E X -G X -L PRT -M PRT -N NOUN -O PRON -P ADP -R ADV -S NOUN -T PRT -U X -V VERB -X PRT -Y PRT -Z NOUN -^ NOUN -~ X -`` . 
-EOL EOL""".strip().split('\n')) - return mapping[tag] - diff --git a/spacy/strings.pxd b/spacy/strings.pxd index abc1d5a28..9c16cfe1c 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -8,7 +8,7 @@ from .structs cimport Utf8Str, UniStr cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil: s.chars = &chars[start] s.n = end - start - s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) + s.key = hash64(s.chars, (s.n * sizeof(Py_UNICODE)), 0) cdef class StringStore: diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 0964f53c2..05c65c15f 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -1,6 +1,6 @@ from libc.stdint cimport uint8_t, uint32_t -from .typedefs cimport flags_t, attr_t, id_t, hash_t +from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t cdef struct Lexeme: @@ -34,7 +34,7 @@ cdef struct Morphology: cdef struct PosTag: Morphology morph int id - int pos + univ_tag_t pos cdef struct TokenC: diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 9e0426a29..ab8ce3962 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -2,7 +2,7 @@ from libc.stdint cimport uint32_t from cymem.cymem cimport Pool -from ..tokens cimport TokenC +from ..structs cimport TokenC cdef struct State: @@ -20,7 +20,8 @@ cdef int pop_stack(State *s) except -1 cdef int push_stack(State *s) except -1 -cdef bint has_head(const TokenC* t) nogil +cdef inline bint has_head(const TokenC* t) nogil: + return t.head != 0 cdef inline int get_idx(const State* s, const TokenC* t) nogil: @@ -70,29 +71,14 @@ cdef inline bint is_final(const State *s) nogil: return at_eol(s) # The stack will be attached to root anyway -cdef int children_in_buffer(const State *s, const int head, const int* gold) except -1 -cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1 -cdef int children_in_stack(const State *s, const int head, const int* gold) except -1 -cdef int head_in_stack(const State *s, const int child, const int* gold) except -1 +cdef int children_in_buffer(const State *s, const int head, int* gold) except -1 +cdef int head_in_buffer(const State *s, const int child, int* gold) except -1 +cdef int children_in_stack(const State *s, const int head, int* gold) except -1 +cdef int head_in_stack(const State *s, const int child, int* gold) except -1 cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL -cdef int count_left_kids(const TokenC* head) nogil - - -cdef int count_right_kids(const TokenC* head) nogil - - -# From https://en.wikipedia.org/wiki/Hamming_weight -cdef inline uint32_t _popcount(uint32_t x) nogil: - """Find number of non-zero bits.""" - cdef int count = 0 - while x != 0: - x &= x - 1 - count += 1 - return count - cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil: cdef int i diff --git a/spacy/syntax/_state.pyx b/spacy/syntax/_state.pyx index e00e5f6a2..6bdfdea3e 100644 --- a/spacy/syntax/_state.pyx +++ b/spacy/syntax/_state.pyx @@ -3,32 +3,24 @@ from libc.string cimport memmove from cymem.cymem cimport Pool from ..lexeme cimport EMPTY_LEXEME -from ..tokens cimport TokenC - - -DEF PADDING = 5 -DEF NON_MONOTONIC = True cdef int add_dep(State *s, int head, int child, int label) except -1: - cdef int dist = head - child - s.sent[child].head = dist + s.sent[child].head = head - child s.sent[child].dep_tag = label # Keep a bit-vector tracking child dependencies. 
If a word has a child at # offset i from it, set that bit (tracking left and right separately) if child > head: - s.sent[head].r_kids |= 1 << (-dist) + s.sent[head].r_kids |= 1 << (-s.sent[child].head) else: - s.sent[head].l_kids |= 1 << dist + s.sent[head].l_kids |= 1 << s.sent[child].head cdef int pop_stack(State *s) except -1: assert s.stack_len >= 1 s.stack_len -= 1 s.stack -= 1 - if s.stack_len == 0 and not at_eol(s): - push_stack(s) - + cdef int push_stack(State *s) except -1: assert s.i < s.sent_len @@ -36,14 +28,9 @@ cdef int push_stack(State *s) except -1: s.stack[0] = s.i s.stack_len += 1 s.i += 1 - if at_eol(s): - while s.stack_len != 0: - if not has_head(get_s0(s)): - get_s0(s).dep_tag = 0 - pop_stack(s) -cdef int children_in_buffer(const State *s, int head, const int* gold) except -1: +cdef int children_in_buffer(const State *s, int head, int* gold) except -1: # Golds holds an array of head offsets --- the head of word i is i - golds[i] # Iterate over the tokens of the queue, and check whether their gold head is # our target @@ -55,21 +42,20 @@ cdef int children_in_buffer(const State *s, int head, const int* gold) except -1 return n -cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1: +cdef int head_in_buffer(const State *s, const int child, int* gold) except -1: return gold[child] >= s.i -cdef int children_in_stack(const State *s, const int head, const int* gold) except -1: +cdef int children_in_stack(const State *s, const int head, int* gold) except -1: cdef int i cdef int n = 0 for i in range(s.stack_len): if gold[s.stack[-i]] == head: - if NON_MONOTONIC or not has_head(get_s0(s)): - n += 1 + n += 1 return n -cdef int head_in_stack(const State *s, const int child, const int* gold) except -1: +cdef int head_in_stack(const State *s, const int child, int* gold) except -1: cdef int i for i in range(s.stack_len): if gold[child] == s.stack[-i]: @@ -86,7 +72,7 @@ cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) n if child >= s.sent: return child else: - return NULL + return s.sent - 1 cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil: @@ -98,20 +84,10 @@ cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) if child < (s.sent + s.sent_len): return child else: - return NULL + return s.sent - 1 -cdef bint has_head(const TokenC* t) nogil: - return t.head != 0 - - -cdef int count_left_kids(const TokenC* head) nogil: - return _popcount(head.l_kids) - - -cdef int count_right_kids(const TokenC* head) nogil: - return _popcount(head.r_kids) - +DEF PADDING = 5 cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL: @@ -126,5 +102,4 @@ cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NUL s.stack_len = 0 s.i = 0 s.sent_len = sent_length - push_stack(s) return s diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index da8163e51..ee9d7b9a8 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -7,11 +7,8 @@ from ._state cimport State cdef struct Transition: - int clas int move int label - int cost - weight_t score cdef class TransitionSystem: @@ -21,8 +18,7 @@ cdef class TransitionSystem: cdef const Transition* _moves - cdef Transition best_valid(self, const weight_t* scores, const State* s) except * - cdef Transition best_gold(self, Transition* guess, const weight_t* scores, - const State* s, - const int* gold_heads, const int* gold_labels) except * + cdef Transition 
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 33ec87919..2883aa403 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -7,8 +7,6 @@ from ._state cimport head_in_stack, children_in_stack
 
 from ..tokens cimport TokenC
 
-DEF NON_MONOTONIC = True
-
 
 cdef enum:
     SHIFT
@@ -27,30 +25,22 @@ cdef inline bint _can_right(const State* s) nogil:
 
 
 cdef inline bint _can_left(const State* s) nogil:
-    if NON_MONOTONIC:
-        return s.stack_len >= 1
-    else:
-        return s.stack_len >= 1 and not has_head(get_s0(s))
+    return s.stack_len >= 1 and not has_head(get_s0(s))
 
 
 cdef inline bint _can_reduce(const State* s) nogil:
-    if NON_MONOTONIC:
-        return s.stack_len >= 2
-    else:
-        return s.stack_len >= 2 and has_head(get_s0(s))
+    return s.stack_len >= 2 and has_head(get_s0(s))
 
 
-cdef int _shift_cost(const State* s, const int* gold) except -1:
+cdef int _shift_cost(const State* s, int* gold) except -1:
     assert not at_eol(s)
     cost = 0
     cost += head_in_stack(s, s.i, gold)
     cost += children_in_stack(s, s.i, gold)
-    if NON_MONOTONIC:
-        cost += gold[s.stack[0]] == s.i
     return cost
 
 
-cdef int _right_cost(const State* s, const int* gold) except -1:
+cdef int _right_cost(const State* s, int* gold) except -1:
     assert s.stack_len >= 1
     cost = 0
     if gold[s.i] == s.stack[0]:
@@ -58,12 +48,10 @@ cdef int _right_cost(const State* s, const int* gold) except -1:
     cost += head_in_buffer(s, s.i, gold)
     cost += children_in_stack(s, s.i, gold)
     cost += head_in_stack(s, s.i, gold)
-    if NON_MONOTONIC:
-        cost += gold[s.stack[0]] == s.i
     return cost
 
 
-cdef int _left_cost(const State* s, const int* gold) except -1:
+cdef int _left_cost(const State* s, int* gold) except -1:
     assert s.stack_len >= 1
     cost = 0
     if gold[s.stack[0]] == s.i:
@@ -71,17 +59,11 @@ cdef int _left_cost(const State* s, const int* gold) except -1:
     cost += head_in_buffer(s, s.stack[0], gold)
     cost += children_in_buffer(s, s.stack[0], gold)
-    if NON_MONOTONIC and s.stack_len >= 2:
-        cost += gold[s.stack[0]] == s.stack[-1]
     return cost
 
 
-cdef int _reduce_cost(const State* s, const int* gold) except -1:
-    cdef int cost = 0
-    cost += children_in_buffer(s, s.stack[0], gold)
-    if NON_MONOTONIC:
-        cost += head_in_buffer(s, s.stack[0], gold)
-    return cost
+cdef int _reduce_cost(const State* s, int* gold) except -1:
+    return children_in_buffer(s, s.stack[0], gold)
 
 
 cdef class TransitionSystem:
@@ -91,40 +73,38 @@ cdef class TransitionSystem:
         right_labels.sort()
         if 'ROOT' in right_labels:
            right_labels.pop(right_labels.index('ROOT'))
+        if 'dep' in right_labels:
+            right_labels.pop(right_labels.index('dep'))
         if 'ROOT' in left_labels:
             left_labels.pop(left_labels.index('ROOT'))
+        if 'dep' in left_labels:
+            left_labels.pop(left_labels.index('dep'))
         self.n_moves = 2 + len(left_labels) + len(right_labels)
         moves = self.mem.alloc(self.n_moves, sizeof(Transition))
         cdef int i = 0
         moves[i].move = SHIFT
         moves[i].label = 0
-        moves[i].clas = i
         i += 1
         moves[i].move = REDUCE
         moves[i].label = 0
-        moves[i].clas = i
         i += 1
-        self.label_ids = {'ROOT': 0}
+        self.label_ids = {'ROOT': 0, 'dep': -1}
         cdef int label_id
         for label_str in left_labels:
             label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
             moves[i].move = LEFT
             moves[i].label = label_id
-            moves[i].clas = i
             i += 1
         for label_str in right_labels:
             label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
             moves[i].move = RIGHT
             moves[i].label = label_id
-            moves[i].clas = i
             i += 1
         self._moves = moves
 
-    cdef int transition(self, State *s, const Transition* t) except -1:
+    cdef int transition(self, State *s, const int clas) except -1:
+        cdef const Transition* t = &self._moves[clas]
         if t.move == SHIFT:
-            # Set the dep label, in case we need it after we reduce
-            if NON_MONOTONIC:
-                get_s0(s).dep_tag = t.label
             push_stack(s)
         elif t.move == LEFT:
             add_dep(s, s.i, s.stack[0], t.label)
@@ -133,12 +113,11 @@ cdef class TransitionSystem:
             add_dep(s, s.stack[0], s.i, t.label)
             push_stack(s)
         elif t.move == REDUCE:
-            add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep_tag)
             pop_stack(s)
         else:
             raise StandardError(t.move)
 
-    cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
+    cdef int best_valid(self, const weight_t* scores, const State* s) except -1:
         cdef bint[N_MOVES] valid
         valid[SHIFT] = _can_shift(s)
         valid[LEFT] = _can_left(s)
@@ -147,61 +126,59 @@ cdef class TransitionSystem:
         cdef int best = -1
         cdef weight_t score = 0
-        cdef weight_t best_r_score = -9000
-        cdef int best_r_label = -1
         cdef int i
         for i in range(self.n_moves):
             if valid[self._moves[i].move] and (best == -1 or scores[i] > score):
                 best = i
                 score = scores[i]
-            if self._moves[i].move == RIGHT and scores[i] > best_r_score:
-                best_r_label = self._moves[i].label
         assert best >= 0
-        cdef Transition t = self._moves[best]
-        t.score = score
-        if t.move == SHIFT:
-            t.label = best_r_label
-        return t
+        return best
 
-    cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
-                              const State* s,
-                              const int* gold_heads, const int* gold_labels) except *:
-        # If we can create a gold dependency, only one action can be correct
+    cdef int best_gold(self, const weight_t* scores, const State* s,
+                       int* gold_heads, int* gold_labels) except -1:
         cdef int[N_MOVES] unl_costs
         unl_costs[SHIFT] = _shift_cost(s, gold_heads) if _can_shift(s) else -1
         unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1
         unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1
         unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1
-        guess.cost = unl_costs[guess.move]
-        cdef Transition t
-        cdef int target_label
-        cdef int i
-        if gold_heads[s.stack[0]] == s.i:
-            target_label = gold_labels[s.stack[0]]
-            if guess.move == LEFT:
-                guess.cost += guess.label != target_label
-            for i in range(self.n_moves):
-                t = self._moves[i]
-                if t.move == LEFT and t.label == target_label:
-                    return t
-        elif gold_heads[s.i] == s.stack[0]:
-            target_label = gold_labels[s.i]
-            if guess.move == RIGHT:
-                guess.cost += guess.label != target_label
-            for i in range(self.n_moves):
-                t = self._moves[i]
-                if t.move == RIGHT and t.label == target_label:
-                    return t
-
+        cdef int cost
+        cdef int move
+        cdef int label
         cdef int best = -1
         cdef weight_t score = -9000
+        cdef int i
         for i in range(self.n_moves):
-            t = self._moves[i]
-            if unl_costs[t.move] == 0 and (best == -1 or scores[i] > score):
-                best = i
-                score = scores[i]
-        t = self._moves[best]
-        t.score = score
-        assert best >= 0
-        return t
+            move = self._moves[i].move
+            label = self._moves[i].label
+            if unl_costs[move] == 0:
+                if move == SHIFT or move == REDUCE:
+                    cost = 0
+                elif move == LEFT:
+                    if gold_heads[s.stack[0]] == s.i and gold_labels[s.stack[0]] != -1:
+                        cost = label != gold_labels[s.stack[0]]
+                    else:
+                        cost = 0
+                elif move == RIGHT:
+                    if gold_heads[s.i] == s.stack[0] and gold_labels[s.i] != -1:
+                        cost = label != gold_labels[s.i]
+                    else:
+                        cost = 0
+                else:
+                    raise StandardError("Unknown Move")
+                if cost == 0 and (best == -1 or scores[i] > score):
+                    best = i
+                    score = scores[i]
+
+        if best < 0:
+            print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT]
+            print s.stack_len
+            print has_head(get_s0(s))
+            print s.sent[s.stack[0]].head
+            print s.stack[0], s.i
+            print gold_heads[s.stack[0]], gold_heads[s.i]
+            print gold_labels[s.i]
+            print children_in_buffer(s, s.stack[0], gold_heads)
+            print head_in_buffer(s, s.stack[0], gold_heads)
+            raise StandardError
+        return best
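The cost functions above implement the usual arc-eager oracle arithmetic: a move's unlabelled cost is the number of gold arcs it cuts off, and `best_gold` then refuses any zero-cost move whose label contradicts the gold arc it would build (a gold label of `-1`, the new `'dep'` id, matches anything). A toy Python version of the SHIFT case, under the assumption that `gold[i]` is the gold head index of word `i`:

```python
def shift_cost(stack, n0, gold):
    # Shifting n0 onto the stack makes any gold arc between n0 and a word
    # already on the stack unreachable, so the cost counts those arcs.
    cost = 0
    cost += sum(1 for s in stack if gold[n0] == s)  # n0's head is on the stack
    cost += sum(1 for s in stack if gold[s] == n0)  # gold children of n0 there
    return cost

# Word 0 sits on the stack and its gold head is the buffer front: a LEFT arc
# is still available now, but SHIFT would forfeit it, so the cost is 1.
gold = {0: 1, 1: 2, 2: 2}
assert shift_cost(stack=[0], n0=1, gold=gold) == 1
```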
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 136c92d43..147bf0ce1 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -2,6 +2,8 @@
 # cython: embedsignature=True
 from __future__ import unicode_literals
 
+from os import path
+
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 
@@ -28,6 +30,17 @@ cdef class Tokenizer:
         self.vocab = Vocab(self.get_props)
         self._load_special_tokenization(rules)
 
+    @classmethod
+    def from_dir(cls, Vocab vocab, object data_dir):
+        if not path.exists(data_dir):
+            raise IOError("Directory %s not found -- cannot load Tokenizer." % data_dir)
+        if not path.isdir(data_dir):
+            raise IOError("Path %s is a file, not a dir -- cannot load Tokenizer." % data_dir)
+
+        assert path.exists(data_dir) and path.isdir(data_dir)
+        rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir)
+        return cls(vocab, rules, prefix_re, suffix_re, infix_re)
+
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
         cdef Tokens tokens = Tokens(self.vocab.strings, length)
diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd
index 02d327b72..f91f55469 100644
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@@ -1,6 +1,26 @@
 from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
 from libc.stdint cimport uint8_t
 
+
+# Google universal tag set
+cpdef enum univ_tag_t:
+    NO_TAG
+    ADJ
+    ADV
+    ADP
+    CONJ
+    DET
+    NOUN
+    NUM
+    PRON
+    PRT
+    VERB
+    X
+    PUNCT
+    EOL
+    N_UNIV_TAGS
+
+
 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
 ctypedef uint32_t attr_t
@@ -10,11 +30,3 @@
 ctypedef uint16_t len_t
 ctypedef uint16_t tag_t
 
-
-cdef struct Morphology:
-    uint8_t number
-    uint8_t tenspect   # Tense/aspect/voice
-    uint8_t mood
-    uint8_t gender
-    uint8_t person
-    uint8_t case
-    uint8_t misc
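The new `univ_tag_t` above encodes the coarse universal tag set (Petrov et al.'s "Google" tags), which `PosTag.pos` in `structs.pxd` now carries instead of a bare `int`. A fine-grained tag map is expected to collapse onto these coarse values; a small Python sketch, where the Penn Treebank rows are illustrative examples rather than the shipped table:

```python
# Same order as the cpdef enum above.
(NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN,
 NUM, PRON, PRT, VERB, X, PUNCT, EOL) = range(14)

# A handful of Penn Treebank tags mapped to their coarse universal tag.
PTB_TO_UNIV = {
    'NN': NOUN, 'NNS': NOUN,
    'VB': VERB, 'VBD': VERB,
    'JJ': ADJ, 'RB': ADV,
    'IN': ADP, '.': PUNCT,
}

assert PTB_TO_UNIV['NNS'] == NOUN
```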
diff --git a/spacy/utf8string.pxd b/spacy/utf8string.pxd
deleted file mode 100644
index 5ef4113d5..000000000
--- a/spacy/utf8string.pxd
+++ /dev/null
@@ -1,34 +0,0 @@
-from preshed.maps cimport PreshMap
-from cymem.cymem cimport Pool
-from murmurhash.mrmr cimport hash64
-
-from .typedefs cimport utf8_t, id_t, hash_t
-
-
-cdef struct Utf8Str:
-    id_t i
-    hash_t key
-    utf8_t chars
-    int length
-
-
-cdef struct UniStr:
-    Py_UNICODE* chars
-    size_t n
-    hash_t key
-
-
-cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
-    s.chars = &chars[start]
-    s.n = end - start
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-
-cdef class StringStore:
-    cdef Pool mem
-    cdef PreshMap _map
-    cdef Utf8Str* strings
-    cdef int size
-    cdef int _resize_at
-
-    cdef const Utf8Str* intern(self, char* chars, int length) except NULL
diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx
deleted file mode 100644
index 1d2b7a264..000000000
--- a/spacy/utf8string.pyx
+++ /dev/null
@@ -1,80 +0,0 @@
-from libc.string cimport memcpy
-
-from murmurhash.mrmr cimport hash64
-import codecs
-
-SEPARATOR = '\n|-SEP-|\n'
-
-
-cdef class StringStore:
-    def __init__(self):
-        self.mem = Pool()
-        self._map = PreshMap()
-        self._resize_at = 10000
-        self.strings = self.mem.alloc(self._resize_at, sizeof(Utf8Str))
-        self.size = 1
-
-    property size:
-        def __get__(self):
-            return self.size - 1
-
-    def __getitem__(self, object string_or_id):
-        cdef bytes byte_string
-        cdef const Utf8Str* utf8str
-        if isinstance(string_or_id, int) or isinstance(string_or_id, long):
-            if string_or_id < 1 or string_or_id >= self.size:
-                raise IndexError(string_or_id)
-            utf8str = &self.strings[string_or_id]
-            return utf8str.chars[:utf8str.length]
-        elif isinstance(string_or_id, bytes):
-            utf8str = self.intern(string_or_id, len(string_or_id))
-            return utf8str.i
-        elif isinstance(string_or_id, unicode):
-            byte_string = string_or_id.encode('utf8')
-            utf8str = self.intern(byte_string, len(byte_string))
-            return utf8str.i
-        else:
-            raise TypeError(type(string_or_id))
-
-    cdef const Utf8Str* intern(self, char* chars, int length) except NULL:
-        # 0 means missing, but we don't bother offsetting the index. We waste
-        # slot 0 to simplify the code, because it doesn't matter.
-        assert length != 0
-        cdef hash_t key = hash64(chars, length * sizeof(char), 0)
-        cdef void* value = self._map.get(key)
-        cdef size_t i
-        if value == NULL:
-            if self.size == self._resize_at:
-                self._resize_at *= 2
-                self.strings = self.mem.realloc(self.strings, self._resize_at * sizeof(Utf8Str))
-            i = self.size
-            self.strings[i].i = self.size
-            self.strings[i].key = key
-            self.strings[i].chars = self.mem.alloc(length, sizeof(char))
-            memcpy(self.strings[i].chars, chars, length)
-            self.strings[i].length = length
-            self._map.set(key, <void*>self.size)
-            self.size += 1
-        else:
-            i = <size_t>value
-        return &self.strings[i]
-
-    def dump(self, loc):
-        strings = []
-        cdef Utf8Str* string
-        cdef bytes py_string
-        for i in range(self.size):
-            string = &self.strings[i]
-            py_string = string.chars[:string.length]
-            strings.append(py_string.decode('utf8'))
-        with codecs.open(loc, 'w', 'utf8') as file_:
-            file_.write(SEPARATOR.join(strings))
-
-    def load(self, loc):
-        with codecs.open(loc, 'r', 'utf8') as file_:
-            strings = file_.read().split(SEPARATOR)
-        cdef unicode string
-        cdef bytes byte_string
-        for string in strings[1:]:
-            byte_string = string.encode('utf8')
-            self.intern(byte_string, len(byte_string))
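The deleted `StringStore` lives on behind `spacy/strings.pxd` (the hunk near the top of this diff shows the same `slice_unicode` helper). Its contract is worth keeping in mind: id 0 is reserved as "missing", lookups are bidirectional, and interning the same string twice yields the same id. A minimal pure-Python sketch of that contract, with all names invented for illustration:

```python
class PyStringStore(object):
    def __init__(self):
        self._by_id = [None]      # waste slot 0, as the C code does
        self._by_string = {}

    def __getitem__(self, string_or_id):
        # Integer in -> string out; string in -> integer id out.
        if isinstance(string_or_id, int):
            if string_or_id < 1 or string_or_id >= len(self._by_id):
                raise IndexError(string_or_id)
            return self._by_id[string_or_id]
        if string_or_id not in self._by_string:   # intern on first sight
            self._by_string[string_or_id] = len(self._by_id)
            self._by_id.append(string_or_id)
        return self._by_string[string_or_id]

store = PyStringStore()
assert store[u'hello'] == store[u'hello'] == 1
assert store[1] == u'hello'
```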
diff --git a/spacy/util.py b/spacy/util.py
index 1c25aeaf2..0bb5868ce 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -11,8 +11,7 @@ def utf8open(loc, mode='r'):
     return codecs.open(loc, mode, 'utf8')
 
 
-def read_lang_data(name):
-    data_dir = path.join(DATA_DIR, name)
+def read_lang_data(data_dir):
     with open(path.join(data_dir, 'specials.json')) as file_:
         tokenization = ujson.load(file_)
     prefix = read_prefix(data_dir)
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 80f702572..abcee19b8 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -19,6 +19,17 @@ cdef class Vocab:
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.get_lex_props = get_props
 
+    @classmethod
+    def from_dir(cls, object data_dir, object get_lex_props=None):
+        if not path.exists(data_dir):
+            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
+        if not path.isdir(data_dir):
+            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
+        cdef Vocab self = cls(get_lex_props)
+        self.strings.load(path.join(data_dir, 'strings'))
+        self.load(path.join(data_dir, 'lexemes'))
+        return self
+
     def __len__(self):
         return self.lexemes.size()
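Taken together, the two `from_dir` constructors and the reworked `read_lang_data` let the vocabulary and tokenizer be loaded from an explicit data directory rather than a language name. An untested usage sketch: the directory layout (`strings`, `lexemes`, `specials.json` plus the affix rule files read by `read_lang_data`) is assumed from the hunks above, and `DATA_DIR` is the constant the old `read_lang_data` consulted:

```python
from os import path

from spacy.util import DATA_DIR
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer

data_dir = path.join(DATA_DIR, 'en')              # hypothetical layout
vocab = Vocab.from_dir(data_dir)                  # loads strings, then lexemes
tokenizer = Tokenizer.from_dir(vocab, data_dir)   # loads specials + affix rules
```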