mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-10 00:20:35 +03:00
* Update tokens.pyx as part of reorg
This commit is contained in:
parent
116f7f3bc1
commit
4c6ce7ee84
|
@ -4,27 +4,12 @@ import numpy as np
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport atom_t
|
|
||||||
|
|
||||||
from .lexeme cimport Lexeme
|
from .structs cimport Lexeme, TokenC, Morphology
|
||||||
|
|
||||||
from .typedefs cimport flags_t
|
from .typedefs cimport flags_t, attr_t, flags_t
|
||||||
from .typedefs cimport Morphology
|
|
||||||
from .lang cimport Language
|
|
||||||
|
|
||||||
|
from .strings cimport StringStore
|
||||||
|
|
||||||
cdef struct TokenC:
|
|
||||||
const Lexeme* lex
|
|
||||||
Morphology morph
|
|
||||||
int idx
|
|
||||||
int pos
|
|
||||||
int lemma
|
|
||||||
int sense
|
|
||||||
int head
|
|
||||||
int dep_tag
|
|
||||||
uint32_t l_kids
|
|
||||||
uint32_t r_kids
|
|
||||||
|
|
||||||
|
|
||||||
ctypedef const Lexeme* const_Lexeme_ptr
|
ctypedef const Lexeme* const_Lexeme_ptr
|
||||||
|
@ -37,7 +22,7 @@ ctypedef fused LexemeOrToken:
|
||||||
|
|
||||||
cdef class Tokens:
|
cdef class Tokens:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef Language lang
|
cdef StringStore strings
|
||||||
cdef list tag_names
|
cdef list tag_names
|
||||||
|
|
||||||
cdef TokenC* data
|
cdef TokenC* data
|
||||||
|
@ -51,7 +36,7 @@ cdef class Tokens:
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
cdef public Language lang
|
cdef public StringStore strings
|
||||||
cdef public int i
|
cdef public int i
|
||||||
cdef public int idx
|
cdef public int idx
|
||||||
cdef int pos
|
cdef int pos
|
||||||
|
@ -59,18 +44,18 @@ cdef class Token:
|
||||||
cdef public int head
|
cdef public int head
|
||||||
cdef public int dep_tag
|
cdef public int dep_tag
|
||||||
|
|
||||||
cdef public atom_t id
|
cdef public attr_t id
|
||||||
cdef public atom_t cluster
|
cdef public attr_t cluster
|
||||||
cdef public atom_t length
|
cdef public attr_t length
|
||||||
cdef public atom_t postype
|
cdef public attr_t postype
|
||||||
cdef public atom_t sensetype
|
cdef public attr_t sensetype
|
||||||
|
|
||||||
cdef public atom_t sic
|
cdef public attr_t sic
|
||||||
cdef public atom_t norm
|
cdef public attr_t norm
|
||||||
cdef public atom_t shape
|
cdef public attr_t shape
|
||||||
cdef public atom_t asciied
|
cdef public attr_t asciied
|
||||||
cdef public atom_t prefix
|
cdef public attr_t prefix
|
||||||
cdef public atom_t suffix
|
cdef public attr_t suffix
|
||||||
|
|
||||||
cdef public float prob
|
cdef public float prob
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
from preshed.counter cimport PreshCounter
|
from preshed.counter cimport PreshCounter
|
||||||
|
|
||||||
from .lexeme cimport *
|
from .lexeme cimport get_attr, EMPTY_LEXEME, LEMMA, attr_id_t
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -30,8 +30,8 @@ cdef class Tokens:
|
||||||
>>> from spacy.en import EN
|
>>> from spacy.en import EN
|
||||||
>>> tokens = EN.tokenize('An example sentence.')
|
>>> tokens = EN.tokenize('An example sentence.')
|
||||||
"""
|
"""
|
||||||
def __init__(self, Language lang, string_length=0):
|
def __init__(self, StringStore string_store, string_length=0):
|
||||||
self.lang = lang
|
self.string_store = string_store
|
||||||
if string_length >= 3:
|
if string_length >= 3:
|
||||||
size = int(string_length / 3.0)
|
size = int(string_length / 3.0)
|
||||||
else:
|
else:
|
||||||
|
@ -50,7 +50,7 @@ cdef class Tokens:
|
||||||
|
|
||||||
def __getitem__(self, i):
|
def __getitem__(self, i):
|
||||||
bounds_check(i, self.length, PADDING)
|
bounds_check(i, self.length, PADDING)
|
||||||
return Token(self.lang, i, self.data[i].idx, self.data[i].pos,
|
return Token(self.string_store, i, self.data[i].idx, self.data[i].pos,
|
||||||
self.data[i].lemma, self.data[i].head, self.data[i].dep_tag,
|
self.data[i].lemma, self.data[i].head, self.data[i].dep_tag,
|
||||||
self.data[i].lex[0])
|
self.data[i].lex[0])
|
||||||
|
|
||||||
|
@ -97,20 +97,6 @@ cdef class Tokens:
|
||||||
counts.inc(attr, 1)
|
counts.inc(attr, 1)
|
||||||
return dict(counts)
|
return dict(counts)
|
||||||
|
|
||||||
def base_nps(self):
|
|
||||||
# Iterate backwards, looking for nouns, and if we're collecting, for an
|
|
||||||
# outside-NP word. We want greedy matching, so it's easier to find the noun.
|
|
||||||
cdef TokenC* token
|
|
||||||
cdef int end = -1
|
|
||||||
for i in range(self.length-1, -1, -1):
|
|
||||||
token = &self.data[i]
|
|
||||||
if end == -1:
|
|
||||||
if self.lang.is_base_np_end(token):
|
|
||||||
end = i
|
|
||||||
elif self.lang.is_outside_base_np(token):
|
|
||||||
yield i-1, end
|
|
||||||
end = -1
|
|
||||||
|
|
||||||
def _realloc(self, new_size):
|
def _realloc(self, new_size):
|
||||||
self.max_length = new_size
|
self.max_length = new_size
|
||||||
n = new_size + (PADDING * 2)
|
n = new_size + (PADDING * 2)
|
||||||
|
@ -129,9 +115,9 @@ cdef class Tokens:
|
||||||
|
|
||||||
@cython.freelist(64)
|
@cython.freelist(64)
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
def __init__(self, Language lang, int i, int idx,
|
def __init__(self, StringStore string_store, int i, int idx,
|
||||||
int pos, int lemma, int head, int dep_tag, dict lex):
|
int pos, int lemma, int head, int dep_tag, dict lex):
|
||||||
self.lang = lang
|
self.string_store = string_store
|
||||||
self.idx = idx
|
self.idx = idx
|
||||||
self.pos = pos
|
self.pos = pos
|
||||||
self.i = i
|
self.i = i
|
||||||
|
@ -158,14 +144,14 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if self.sic == 0:
|
if self.sic == 0:
|
||||||
return ''
|
return ''
|
||||||
cdef bytes utf8string = self.lang.lexicon.strings[self.sic]
|
cdef bytes utf8string = self.string_store[self.sic]
|
||||||
return utf8string.decode('utf8')
|
return utf8string.decode('utf8')
|
||||||
|
|
||||||
property lemma:
|
property lemma:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if self.lemma == 0:
|
if self.lemma == 0:
|
||||||
return self.string
|
return self.string
|
||||||
cdef bytes utf8string = self.lang.lexicon.strings[self.lemma]
|
cdef bytes utf8string = self.string_store[self.lemma]
|
||||||
return utf8string.decode('utf8')
|
return utf8string.decode('utf8')
|
||||||
|
|
||||||
property pos:
|
property pos:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user