* Update tokens.pyx as part of reorg

This commit is contained in:
Matthew Honnibal 2014-12-20 07:03:26 +11:00
parent 116f7f3bc1
commit 4c6ce7ee84
2 changed files with 24 additions and 53 deletions

View File

@ -4,27 +4,12 @@ import numpy as np
cimport numpy as np cimport numpy as np
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from .lexeme cimport Lexeme from .structs cimport Lexeme, TokenC, Morphology
from .typedefs cimport flags_t from .typedefs cimport flags_t, attr_t, flags_t
from .typedefs cimport Morphology
from .lang cimport Language
from .strings cimport StringStore
cdef struct TokenC:
const Lexeme* lex
Morphology morph
int idx
int pos
int lemma
int sense
int head
int dep_tag
uint32_t l_kids
uint32_t r_kids
ctypedef const Lexeme* const_Lexeme_ptr ctypedef const Lexeme* const_Lexeme_ptr
@ -37,7 +22,7 @@ ctypedef fused LexemeOrToken:
cdef class Tokens: cdef class Tokens:
cdef Pool mem cdef Pool mem
cdef Language lang cdef StringStore strings
cdef list tag_names cdef list tag_names
cdef TokenC* data cdef TokenC* data
@ -51,7 +36,7 @@ cdef class Tokens:
cdef class Token: cdef class Token:
cdef public Language lang cdef public StringStore strings
cdef public int i cdef public int i
cdef public int idx cdef public int idx
cdef int pos cdef int pos
@ -59,18 +44,18 @@ cdef class Token:
cdef public int head cdef public int head
cdef public int dep_tag cdef public int dep_tag
cdef public atom_t id cdef public attr_t id
cdef public atom_t cluster cdef public attr_t cluster
cdef public atom_t length cdef public attr_t length
cdef public atom_t postype cdef public attr_t postype
cdef public atom_t sensetype cdef public attr_t sensetype
cdef public atom_t sic cdef public attr_t sic
cdef public atom_t norm cdef public attr_t norm
cdef public atom_t shape cdef public attr_t shape
cdef public atom_t asciied cdef public attr_t asciied
cdef public atom_t prefix cdef public attr_t prefix
cdef public atom_t suffix cdef public attr_t suffix
cdef public float prob cdef public float prob

View File

@ -2,7 +2,7 @@
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter from preshed.counter cimport PreshCounter
from .lexeme cimport * from .lexeme cimport get_attr, EMPTY_LEXEME, LEMMA, attr_id_t
cimport cython cimport cython
import numpy as np import numpy as np
@ -30,8 +30,8 @@ cdef class Tokens:
>>> from spacy.en import EN >>> from spacy.en import EN
>>> tokens = EN.tokenize('An example sentence.') >>> tokens = EN.tokenize('An example sentence.')
""" """
def __init__(self, Language lang, string_length=0): def __init__(self, StringStore string_store, string_length=0):
self.lang = lang self.string_store = string_store
if string_length >= 3: if string_length >= 3:
size = int(string_length / 3.0) size = int(string_length / 3.0)
else: else:
@ -50,7 +50,7 @@ cdef class Tokens:
def __getitem__(self, i): def __getitem__(self, i):
bounds_check(i, self.length, PADDING) bounds_check(i, self.length, PADDING)
return Token(self.lang, i, self.data[i].idx, self.data[i].pos, return Token(self.string_store, i, self.data[i].idx, self.data[i].pos,
self.data[i].lemma, self.data[i].head, self.data[i].dep_tag, self.data[i].lemma, self.data[i].head, self.data[i].dep_tag,
self.data[i].lex[0]) self.data[i].lex[0])
@ -97,20 +97,6 @@ cdef class Tokens:
counts.inc(attr, 1) counts.inc(attr, 1)
return dict(counts) return dict(counts)
def base_nps(self):
# Iterate backwards, looking for nouns, and if we're collecting, for an
# outside-NP word. We want greedy matching, so it's easier to find the noun.
cdef TokenC* token
cdef int end = -1
for i in range(self.length-1, -1, -1):
token = &self.data[i]
if end == -1:
if self.lang.is_base_np_end(token):
end = i
elif self.lang.is_outside_base_np(token):
yield i-1, end
end = -1
def _realloc(self, new_size): def _realloc(self, new_size):
self.max_length = new_size self.max_length = new_size
n = new_size + (PADDING * 2) n = new_size + (PADDING * 2)
@ -129,9 +115,9 @@ cdef class Tokens:
@cython.freelist(64) @cython.freelist(64)
cdef class Token: cdef class Token:
def __init__(self, Language lang, int i, int idx, def __init__(self, StringStore string_store, int i, int idx,
int pos, int lemma, int head, int dep_tag, dict lex): int pos, int lemma, int head, int dep_tag, dict lex):
self.lang = lang self.string_store = string_store
self.idx = idx self.idx = idx
self.pos = pos self.pos = pos
self.i = i self.i = i
@ -158,14 +144,14 @@ cdef class Token:
def __get__(self): def __get__(self):
if self.sic == 0: if self.sic == 0:
return '' return ''
cdef bytes utf8string = self.lang.lexicon.strings[self.sic] cdef bytes utf8string = self.string_store[self.sic]
return utf8string.decode('utf8') return utf8string.decode('utf8')
property lemma: property lemma:
def __get__(self): def __get__(self):
if self.lemma == 0: if self.lemma == 0:
return self.string return self.string
cdef bytes utf8string = self.lang.lexicon.strings[self.lemma] cdef bytes utf8string = self.string_store[self.lemma]
return utf8string.decode('utf8') return utf8string.decode('utf8')
property pos: property pos: