From 4c6ce7ee84acd4f2f47eaddb453ffc8786a41070 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Dec 2014 07:03:26 +1100 Subject: [PATCH] * Update tokens.pyx as part of reorg --- spacy/tokens.pxd | 47 ++++++++++++++++------------------------------- spacy/tokens.pyx | 30 ++++++++---------------------- 2 files changed, 24 insertions(+), 53 deletions(-) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 9a0e09f92..12eb70cc1 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -4,27 +4,12 @@ import numpy as np cimport numpy as np from cymem.cymem cimport Pool -from thinc.typedefs cimport atom_t -from .lexeme cimport Lexeme +from .structs cimport Lexeme, TokenC, Morphology -from .typedefs cimport flags_t -from .typedefs cimport Morphology -from .lang cimport Language +from .typedefs cimport flags_t, attr_t, flags_t - - -cdef struct TokenC: - const Lexeme* lex - Morphology morph - int idx - int pos - int lemma - int sense - int head - int dep_tag - uint32_t l_kids - uint32_t r_kids +from .strings cimport StringStore ctypedef const Lexeme* const_Lexeme_ptr @@ -37,7 +22,7 @@ ctypedef fused LexemeOrToken: cdef class Tokens: cdef Pool mem - cdef Language lang + cdef StringStore strings cdef list tag_names cdef TokenC* data @@ -51,7 +36,7 @@ cdef class Tokens: cdef class Token: - cdef public Language lang + cdef public StringStore strings cdef public int i cdef public int idx cdef int pos @@ -59,18 +44,18 @@ cdef class Token: cdef public int head cdef public int dep_tag - cdef public atom_t id - cdef public atom_t cluster - cdef public atom_t length - cdef public atom_t postype - cdef public atom_t sensetype + cdef public attr_t id + cdef public attr_t cluster + cdef public attr_t length + cdef public attr_t postype + cdef public attr_t sensetype - cdef public atom_t sic - cdef public atom_t norm - cdef public atom_t shape - cdef public atom_t asciied - cdef public atom_t prefix - cdef public atom_t suffix + cdef public attr_t sic + cdef public attr_t norm + cdef public attr_t shape + cdef public attr_t asciied + cdef public attr_t prefix + cdef public attr_t suffix cdef public float prob diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index e0d320b30..f4b1c952d 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -2,7 +2,7 @@ from preshed.maps cimport PreshMap from preshed.counter cimport PreshCounter -from .lexeme cimport * +from .lexeme cimport get_attr, EMPTY_LEXEME, LEMMA, attr_id_t cimport cython import numpy as np @@ -30,8 +30,8 @@ cdef class Tokens: >>> from spacy.en import EN >>> tokens = EN.tokenize('An example sentence.') """ - def __init__(self, Language lang, string_length=0): - self.lang = lang + def __init__(self, StringStore string_store, string_length=0): + self.string_store = string_store if string_length >= 3: size = int(string_length / 3.0) else: @@ -50,7 +50,7 @@ cdef class Tokens: def __getitem__(self, i): bounds_check(i, self.length, PADDING) - return Token(self.lang, i, self.data[i].idx, self.data[i].pos, + return Token(self.string_store, i, self.data[i].idx, self.data[i].pos, self.data[i].lemma, self.data[i].head, self.data[i].dep_tag, self.data[i].lex[0]) @@ -97,20 +97,6 @@ cdef class Tokens: counts.inc(attr, 1) return dict(counts) - def base_nps(self): - # Iterate backwards, looking for nouns, and if we're collecting, for an - # outside-NP word. We want greedy matching, so it's easier to find the noun. - cdef TokenC* token - cdef int end = -1 - for i in range(self.length-1, -1, -1): - token = &self.data[i] - if end == -1: - if self.lang.is_base_np_end(token): - end = i - elif self.lang.is_outside_base_np(token): - yield i-1, end - end = -1 - def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) @@ -129,9 +115,9 @@ cdef class Tokens: @cython.freelist(64) cdef class Token: - def __init__(self, Language lang, int i, int idx, + def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma, int head, int dep_tag, dict lex): - self.lang = lang + self.string_store = string_store self.idx = idx self.pos = pos self.i = i @@ -158,14 +144,14 @@ cdef class Token: def __get__(self): if self.sic == 0: return '' - cdef bytes utf8string = self.lang.lexicon.strings[self.sic] + cdef bytes utf8string = self.string_store[self.sic] return utf8string.decode('utf8') property lemma: def __get__(self): if self.lemma == 0: return self.string - cdef bytes utf8string = self.lang.lexicon.strings[self.lemma] + cdef bytes utf8string = self.string_store[self.lemma] return utf8string.decode('utf8') property pos: