From 4ecbe8c8938c04ac15cbcc4b572552d47e927524 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 5 Nov 2014 20:45:29 +1100 Subject: [PATCH] * Complete refactor of Tagger features, to use a generic list of context names. --- spacy/context.pxd | 56 +++++------ spacy/context.pyx | 52 ++++++---- spacy/lang.pxd | 1 + spacy/lang.pyx | 2 + spacy/ner_feats.pxd | 0 spacy/ner_feats.pyx | 35 +++++++ spacy/pos.pyx | 229 -------------------------------------------- spacy/pos_feats.pxd | 83 ---------------- spacy/pos_feats.pyx | 102 +++++++------------- spacy/pos_util.py | 6 +- spacy/tagger.pxd | 5 +- spacy/tagger.pyx | 23 ++--- spacy/tokens.pxd | 3 + spacy/tokens.pyx | 19 +++- 14 files changed, 166 insertions(+), 450 deletions(-) create mode 100644 spacy/ner_feats.pxd create mode 100644 spacy/ner_feats.pyx delete mode 100644 spacy/pos.pyx diff --git a/spacy/context.pxd b/spacy/context.pxd index 9d7cf000f..47aedb3a4 100644 --- a/spacy/context.pxd +++ b/spacy/context.pxd @@ -4,40 +4,42 @@ from .tokens cimport Tokens from .lexeme cimport Lexeme -cdef struct Token: - atom_t i - atom_t c - atom_t w - atom_t shape - atom_t pref - atom_t suff - atom_t oft_title - atom_t oft_upper - atom_t is_alpha - atom_t is_digit - atom_t is_title - atom_t is_upper +cdef class Token: + cdef readonly atom_t i + cdef readonly atom_t c + cdef readonly atom_t w + cdef readonly atom_t shape + cdef readonly atom_t pref + cdef readonly atom_t suff + cdef readonly atom_t oft_title + cdef readonly atom_t oft_upper + cdef readonly atom_t is_alpha + cdef readonly atom_t is_digit + cdef readonly atom_t is_title + cdef readonly atom_t is_upper - atom_t url - atom_t num + cdef readonly atom_t url + cdef readonly atom_t num - atom_t postype - atom_t pos - atom_t ner + cdef readonly atom_t postype + cdef readonly atom_t pos + cdef readonly atom_t ner -cdef struct Slots: - Token P2 - Token P1 - Token N0 - Token N1 - Token N2 +cdef class Slots: + cdef readonly Token P2 + cdef readonly Token P1 + cdef readonly Token N0 + cdef readonly Token N1 + cdef readonly Token N2 -cdef Slots FIELD_IDS cdef int N_FIELDS -cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0 +cdef hash_t fill_slots(Slots s, int i, Tokens tokens) except 0 -cdef int fill_flat(atom_t* context, Slots* s) except -1 +cdef int fill_flat(atom_t* context, Slots s) except -1 + + +cpdef Slots FIELD_IDS diff --git a/spacy/context.pyx b/spacy/context.pyx index 465dd32c3..d715f2e5e 100644 --- a/spacy/context.pyx +++ b/spacy/context.pyx @@ -2,7 +2,16 @@ from murmurhash.mrmr cimport hash64 from .lexeme cimport * -cdef void _number_token(Token* t, int* n_fields): +cdef class Slots: + def __init__(self): + self.P2 = Token() + self.P1 = Token() + self.N0 = Token() + self.N1 = Token() + self.N2 = Token() + + +cdef void _number_token(Token t, int* n_fields): cdef int i = n_fields[0] t.i = i; i += 1 t.c = i; i += 1 @@ -27,7 +36,7 @@ cdef void _number_token(Token* t, int* n_fields): n_fields[0] = i -cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner): +cdef int fill_token(Token t, Lexeme* lex, atom_t pos, atom_t ner): t.i = lex.sic t.c = lex.cluster t.w = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape @@ -48,7 +57,7 @@ cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner): t.ner = ner -cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1: +cdef int _flatten_token(atom_t* context, Token ids, Token vals) except -1: context[ids.i] = vals.i context[ids.c] = vals.c context[ids.w] = vals.w @@ -68,26 +77,27 @@ cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1: context[ids.ner] = vals.ner -cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0: - fill_token(&s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2]) - fill_token(&s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1]) - fill_token(&s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i]) - fill_token(&s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1]) - fill_token(&s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2]) - return hash64(s, sizeof(Slots), 0) +cdef hash_t fill_slots(Slots s, int i, Tokens tokens) except 0: + fill_token(s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2]) + fill_token(s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1]) + fill_token(s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i]) + fill_token(s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1]) + fill_token(s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2]) + return 1 -cdef int fill_flat(atom_t* context, Slots* s) except -1: - _flatten_token(context, &FIELD_IDS.P2, &s.P2) - _flatten_token(context, &FIELD_IDS.P1, &s.P1) - _flatten_token(context, &FIELD_IDS.N0, &s.N0) - _flatten_token(context, &FIELD_IDS.N1, &s.N1) - _flatten_token(context, &FIELD_IDS.N2, &s.N2) +cdef int fill_flat(atom_t* context, Slots s) except -1: + _flatten_token(context, FIELD_IDS.P2, s.P2) + _flatten_token(context, FIELD_IDS.P1, s.P1) + _flatten_token(context, FIELD_IDS.N0, s.N0) + _flatten_token(context, FIELD_IDS.N1, s.N1) + _flatten_token(context, FIELD_IDS.N2, s.N2) N_FIELDS = 0 -_number_token(&FIELD_IDS.P2, &N_FIELDS) -_number_token(&FIELD_IDS.P1, &N_FIELDS) -_number_token(&FIELD_IDS.N0, &N_FIELDS) -_number_token(&FIELD_IDS.N1, &N_FIELDS) -_number_token(&FIELD_IDS.N2, &N_FIELDS) +FIELD_IDS = Slots() +_number_token(FIELD_IDS.P2, &N_FIELDS) +_number_token(FIELD_IDS.P1, &N_FIELDS) +_number_token(FIELD_IDS.N0, &N_FIELDS) +_number_token(FIELD_IDS.N1, &N_FIELDS) +_number_token(FIELD_IDS.N2, &N_FIELDS) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index dd5dd7f02..7283cfb47 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -42,6 +42,7 @@ cdef class Language: cpdef readonly Lexicon lexicon cpdef readonly Tagger pos_tagger + cpdef readonly Tagger ner_tagger cdef object _prefix_re cdef object _suffix_re diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 4e61e1fef..9a2eafd8e 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -45,6 +45,8 @@ cdef class Language: self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos')) else: self.pos_tagger = None + if path.exists(path.join(util.DATA_DIR, name, 'ner')): + self.ner_tagger = Tagger(path.join(util.DATA_DIR, name, 'ner')) cpdef Tokens tokenize(self, unicode string): """Tokenize a string. diff --git a/spacy/ner_feats.pxd b/spacy/ner_feats.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/ner_feats.pyx b/spacy/ner_feats.pyx new file mode 100644 index 000000000..870295766 --- /dev/null +++ b/spacy/ner_feats.pyx @@ -0,0 +1,35 @@ +from spacy.context cimport FIELD_IDS, Token + + +cdef Token P2 = FIELD_IDS.P2 +cdef Token P1 = FIELD_IDS.P1 +cdef Token N0 = FIELD_IDS.N0 +cdef Token N1 = FIELD_IDS.N1 +cdef Token N2 = FIELD_IDS.N2 + + +TEMPLATES = ( + (N0.i,), + (N0.c,), + + (P1.pos,), + (P1.i,), + + (N1.w,), + (N1.pos,), + + (P1.ner,), + (P2.ner,), + + (N0.c,), + (P1.c,), + (N1.c,), + + (N0.is_alpha,), + (N0.is_digit,), + (N0.is_title,), + (N0.is_upper,), + + (N0.is_title, N0.oft_title), + (N0.is_upper, N0.oft_upper), +) diff --git a/spacy/pos.pyx b/spacy/pos.pyx deleted file mode 100644 index 19459d969..000000000 --- a/spacy/pos.pyx +++ /dev/null @@ -1,229 +0,0 @@ -# cython: profile=True -from os import path -import os -import shutil -import ujson -import random -import codecs -import gzip -import cython - -from libc.stdint cimport uint32_t - - -from thinc.weights cimport arg_max -from thinc.features import NonZeroConjFeat -from thinc.features import ConjFeat - -from .lexeme cimport * -from .lang cimport Lexicon - - -NULL_TAG = 0 - - -cdef class Tagger: - tags = {'NULL': NULL_TAG} - def __init__(self, model_dir): - self.mem = Pool() - tags_loc = path.join(model_dir, 'postags.json') - if path.exists(tags_loc): - with open(tags_loc) as file_: - Tagger.tags.update(ujson.load(file_)) - self.model = LinearModel(len(self.tags)) - if path.exists(path.join(model_dir, 'model')): - self.model.load(path.join(model_dir, 'model')) - self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES]) - self._atoms = self.mem.alloc(CONTEXT_SIZE, sizeof(atom_t)) - self._feats = self.mem.alloc(self.extractor.n+1, sizeof(feat_t)) - self._values = self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) - self._scores = self.mem.alloc(len(self.tags), sizeof(weight_t)) - self._guess = NULL_TAG - - cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0: - get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i], - tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev) - self.extractor.extract(self._feats, self._values, self._atoms, NULL) - self._guess = self.model.score(self._scores, self._feats, self._values) - return self._guess - - cpdef bint tell_answer(self, class_t gold) except *: - cdef class_t guess = self._guess - if gold == guess or gold == NULL_TAG: - self.model.update({}) - return 0 - counts = {guess: {}, gold: {}} - self.extractor.count(counts[gold], self._feats, 1) - self.extractor.count(counts[guess], self._feats, -1) - self.model.update(counts) - - @classmethod - def encode_pos(cls, tag): - if tag not in cls.tags: - cls.tags[tag] = len(cls.tags) - return cls.tags[tag] - - -@cython.boundscheck(False) -def count_tags(Tagger tagger, Tokens tokens, uint32_t[:, :] tag_counts): - cdef class_t prev_prev, prev, tag - prev = tagger.tags['EOL']; prev_prev = tagger.tags['EOL'] - cdef int i - cdef id_t token - for i in range(tokens.length): - tag = tagger.predict(i, tokens, prev, prev_prev) - prev_prev = prev - prev = tag - token = tokens.lex[i].id - if token < tag_counts.shape[0]: - tag_counts[token, tag] += 1 - - -cpdef enum: - P2i - P2c - P2w - P2shape - P2pref - P2suff - P2title - P2upper - P2oft_title - P2oft_upper - P2pos - P2url - P2num - - P1i - P1c - P1w - P1shape - P1pre - P1suff - P1title - P1upper - P1oft_title - P1oft_upper - P1pos - P1url - P1num - - N0i - N0c - N0w - N0shape - N0pref - N0suff - N0title - N0upper - N0oft_title - N0oft_upper - N0pos - N0url - N0num - - N1i - N1c - N1w - N1shape - N1pref - N1suff - N1title - N1upper - N1oft_title - N1oft_upper - N1pos - N1url - N1num - - N2i - N2c - N2w - N2shape - N2pref - N2suff - N2title - N2upper - N2oft_title - N2oft_upper - N2pos - N2url - N2num - - P2t - P1t - - CONTEXT_SIZE - - -cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1, - Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1: - _fill_token(&atoms[P2i], p2) - _fill_token(&atoms[P1i], p1) - _fill_token(&atoms[N0i], n0) - _fill_token(&atoms[N1i], n1) - _fill_token(&atoms[N2i], n2) - atoms[P1t] = prev_tag - atoms[P2t] = prev_prev_tag - - -cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil: - atoms[0] = lex.sic - atoms[1] = lex.cluster - atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape - atoms[3] = lex.shape - atoms[4] = lex.prefix - atoms[5] = lex.suffix - - atoms[6] = lex.flags & (1 << IS_TITLE) - atoms[7] = lex.flags & (1 << IS_UPPER) - atoms[8] = lex.flags & (1 << OFT_TITLE) - atoms[9] = lex.flags & (1 << OFT_UPPER) - atoms[10] = lex.postype - atoms[11] = lex.flags & (1 << LIKE_URL) - atoms[12] = lex.flags & (1 << LIKE_NUMBER) - -TEMPLATES = ( - (N0i,), - (N0w,), - (N0suff,), - (N0pref,), - (P1t,), - (P2t,), - (P1t, P2t), - (P1t, N0w), - (P1w,), - (P1suff,), - (P2w,), - (N1w,), - (N1suff,), - (N2w,), - - (N0shape,), - (N0c,), - (N1c,), - (N2c,), - (P1c,), - (P2c,), - (P1c, N0c), - (N0c, N1c), - (P1c, P1t), - (P1c, P1t, N0c), - (P1t, N0c), - (N0oft_upper,), - (N0oft_title,), - - (P1w, N0w), - (N0w, N1w), - - (N0pos,), - (P1t, N0pos, N1pos), - (P1t, N1pos), - - (N0url,), - (N0num,), - (P1url,), - (P1url,), - (N1num,), - (N1url,), -) diff --git a/spacy/pos_feats.pxd b/spacy/pos_feats.pxd index 0a13e9416..e69de29bb 100644 --- a/spacy/pos_feats.pxd +++ b/spacy/pos_feats.pxd @@ -1,83 +0,0 @@ -from .tokens cimport Tokens -from thinc.typedefs cimport atom_t - - -cpdef enum: - P2i - P2c - P2w - P2shape - P2pref - P2suff - P2title - P2upper - P2oft_title - P2oft_upper - P2pos - P2url - P2num - - P1i - P1c - P1w - P1shape - P1pre - P1suff - P1title - P1upper - P1oft_title - P1oft_upper - P1pos - P1url - P1num - - N0i - N0c - N0w - N0shape - N0pref - N0suff - N0title - N0upper - N0oft_title - N0oft_upper - N0pos - N0url - N0num - - N1i - N1c - N1w - N1shape - N1pref - N1suff - N1title - N1upper - N1oft_title - N1oft_upper - N1pos - N1url - N1num - - N2i - N2c - N2w - N2shape - N2pref - N2suff - N2title - N2upper - N2oft_title - N2oft_upper - N2pos - N2url - N2num - - P2t - P1t - - CONTEXT_SIZE - - - -cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1 diff --git a/spacy/pos_feats.pyx b/spacy/pos_feats.pyx index 89cdebab9..600d75d3d 100644 --- a/spacy/pos_feats.pyx +++ b/spacy/pos_feats.pyx @@ -1,77 +1,41 @@ -from .lexeme cimport * +from spacy.context cimport FIELD_IDS, Token -from thinc.typedefs cimport atom_t + +cpdef Token P2 = FIELD_IDS.P2 +cpdef Token P1 = FIELD_IDS.P1 +cpdef Token N0 = FIELD_IDS.N0 +cpdef Token N1 = FIELD_IDS.N1 +cpdef Token N2 = FIELD_IDS.N2 TEMPLATES = ( - (N0i,), - (N0w,), - (N0suff,), - (N0pref,), - (P1t,), - (P2t,), - (P1t, P2t), - (P1t, N0w), - (P1w,), - (P1suff,), - (P2w,), - (N1w,), - (N1suff,), - (N2w,), + (N0.i,), + (N0.w,), + (N0.suff,), + (N0.pref,), + (P1.pos,), + (P2.pos,), + (P1.pos, P2.pos), + (P1.pos, N0.w), + (P1.w,), + (P1.suff,), + (P2.w,), + (N1.w,), + (N1.suff,), + (N2.w,), - (N0shape,), - (N0c,), - (N1c,), - (N2c,), - (P1c,), - (P2c,), - (P1c, N0c), - (N0c, N1c), - (P1c, P1t), - (P1c, P1t, N0c), - (P1t, N0c), - (N0oft_upper,), - (N0oft_title,), + (N0.shape,), + (N0.c,), + (N1.c,), + (N2.c,), + (P1.c,), + (P2.c,), + (N0.oft_upper,), + (N0.oft_title,), - (P1w, N0w), - (N0w, N1w), + (N0.postype,), - (N0pos,), - (P1t, N0pos, N1pos), - (P1t, N1pos), - - (N0url,), - (N0num,), - (P1url,), - (P1url,), - (N1num,), - (N1url,), + (P1.url,), + (N1.num,), + (N1.url,), ) - - -cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1: - _fill_token(&context[P2i], tokens.lex[i-2]) - _fill_token(&context[P1i], tokens.lex[i-1]) - _fill_token(&context[N0i], tokens.lex[i]) - _fill_token(&context[N1i], tokens.lex[i+1]) - _fill_token(&context[N2i], tokens.lex[i+2]) - context[P1t] = tokens.pos[i-1] - context[P2t] = tokens.pos[i-2] - - -cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil: - atoms[0] = lex.sic - atoms[1] = lex.cluster - atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape - atoms[3] = lex.shape - atoms[4] = lex.prefix - atoms[5] = lex.suffix - - atoms[6] = lex.flags & (1 << IS_TITLE) - atoms[7] = lex.flags & (1 << IS_UPPER) - atoms[8] = lex.flags & (1 << OFT_TITLE) - atoms[9] = lex.flags & (1 << OFT_UPPER) - atoms[10] = lex.postype - atoms[11] = lex.flags & (1 << LIKE_URL) - atoms[12] = lex.flags & (1 << LIKE_NUMBER) - diff --git a/spacy/pos_util.py b/spacy/pos_util.py index eac464707..039e6b15d 100644 --- a/spacy/pos_util.py +++ b/spacy/pos_util.py @@ -3,10 +3,8 @@ from . import util from . import tokens from .en import EN -from .pos import Tagger - -def read_gold(file_, tag_list): +def read_gold(file_, tag_list, col): paras = file_.read().strip().split('\n\n') golds = [] tag_ids = dict((tag, i) for i, tag in enumerate(tag_list)) @@ -21,7 +19,7 @@ def read_gold(file_, tag_list): conll_toks = [] for line in lines: pieces = line.split() - conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[3])) + conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[col])) for i, token in enumerate(tokens): if not conll_toks: tags.append('NULL') diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index ca5fbd6ee..e8f6357b0 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -4,6 +4,8 @@ from thinc.learner cimport LinearModel from thinc.features cimport Extractor from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t +from .typedefs cimport hash_t +from .context cimport Slots from .tokens cimport Tokens @@ -26,7 +28,8 @@ cdef class Tagger: cpdef readonly list tag_names cdef class_t _guess - cdef atom_t* _context + cdef atom_t* _context_flat + cdef Slots _context_slots cdef feat_t* _feats cdef weight_t* _values cdef weight_t* _scores diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 22aa4814d..bca5dcc7c 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -10,8 +10,9 @@ import random import json import cython -from .pos_feats cimport fill_context as pos_fill_context -from .pos_feats cimport CONTEXT_SIZE as POS_CONTEXT_SIZE +from .context cimport fill_slots +from .context cimport fill_flat +from .context cimport N_FIELDS from thinc.features cimport ConjFeat @@ -46,6 +47,7 @@ def train(train_sents, model_dir, nr_iter=5): if gold != NULL_TAG: total += 1 n_corr += guess == gold + #print('%s\t%d\t%d' % (tokens[i].string, guess, gold)) print('%.4f' % ((n_corr / total) * 100)) random.shuffle(train_sents) tagger.model.end_training() @@ -76,15 +78,12 @@ cdef class Tagger: self.tag_names = cfg['tag_names'] self.tag_type = cfg['tag_type'] self.extractor = Extractor(templates, [ConjFeat] * len(templates)) - self.model = LinearModel(len(self.tag_names), self.extractor.n) - print("Load tagger model") + self.model = LinearModel(len(self.tag_names)) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) - print("Done") - if self.tag_type == POS: - n_context = POS_CONTEXT_SIZE - self._context = self.mem.alloc(n_context, sizeof(atom_t)) + self._context_flat = self.mem.alloc(N_FIELDS, sizeof(atom_t)) + self._context_slots = Slots() self._feats = self.mem.alloc(self.extractor.n+1, sizeof(feat_t)) self._values = self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) self._scores = self.mem.alloc(self.model.nr_class, sizeof(weight_t)) @@ -110,11 +109,9 @@ cdef class Tagger: >>> tag = EN.pos_tagger.predict(0, tokens) >>> assert tag == EN.pos_tagger.tag_id('DT') == 5 """ - if self.tag_type == POS: - pos_fill_context(self._context, i, tokens) - else: - raise StandardError - self.extractor.extract(self._feats, self._values, self._context, NULL) + cdef hash_t hashed = fill_slots(self._context_slots, i, tokens) + fill_flat(self._context_flat, self._context_slots) + self.extractor.extract(self._feats, self._values, self._context_flat, NULL) self._guess = self.model.score(self._scores, self._feats, self._values) return self._guess diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 529161035..d1b2ef10b 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -15,9 +15,11 @@ cdef class Tokens: cdef Lexeme** _lex_ptr cdef int* _idx_ptr cdef int* _pos_ptr + cdef int* _ner_ptr cdef Lexeme** lex cdef int* idx cdef int* pos + cdef int* ner cdef int length cdef int max_length @@ -32,6 +34,7 @@ cdef class Token: cdef public int i cdef public int idx cdef public int pos + cdef public int ner cdef public atom_t id cdef public atom_t cluster diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 99b022a27..721e6bb80 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,6 +1,7 @@ # cython: profile=True from .lexeme cimport * cimport cython +from .tagger cimport POS, ENTITY DEF PADDING = 5 @@ -44,21 +45,25 @@ cdef class Tokens: self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*)) self._idx_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) self._pos_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) + self._ner_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) self.lex = self._lex_ptr self.idx = self._idx_ptr self.pos = self._pos_ptr + self.ner = self._ner_ptr cdef int i for i in range(size + (PADDING*2)): self.lex[i] = &EMPTY_LEXEME self.lex += PADDING self.idx += PADDING self.pos += PADDING + self.ner += PADDING self.max_length = size self.length = 0 def __getitem__(self, i): bounds_check(i, self.length, PADDING) - return Token(self._string_store, i, self.idx[i], self.pos[i], self.lex[i][0]) + return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i], + self.lex[i][0]) def __iter__(self): for i in range(self.length): @@ -73,6 +78,7 @@ cdef class Tokens: self.lex[self.length] = lexeme self.idx[self.length] = idx self.pos[self.length] = 0 + self.ner[self.length] = 0 self.length += 1 return idx + lexeme.length @@ -91,7 +97,10 @@ cdef class Tokens: return idx cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1: - self.pos[i] = tag + if tag_type == POS: + self.pos[i] = tag + elif tag_type == ENTITY: + self.ner[i] = tag def _realloc(self, new_size): self.max_length = new_size @@ -99,19 +108,23 @@ cdef class Tokens: self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*)) self._idx_ptr = self.mem.realloc(self._idx_ptr, n * sizeof(int)) self._pos_ptr = self.mem.realloc(self._pos_ptr, n * sizeof(int)) + self._ner_ptr = self.mem.realloc(self._ner_ptr, n * sizeof(int)) self.lex = self._lex_ptr + PADDING self.idx = self._idx_ptr + PADDING self.pos = self._pos_ptr + PADDING + self.ner = self._ner_ptr + PADDING for i in range(self.length, self.max_length + PADDING): self.lex[i] = &EMPTY_LEXEME @cython.freelist(64) cdef class Token: - def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex): + def __init__(self, StringStore string_store, int i, int idx, int pos, int ner, + dict lex): self._string_store = string_store self.idx = idx self.pos = pos + self.ner = ner self.i = i self.id = lex['id']