Mirror of https://github.com/explosion/spaCy.git
* Working morphology and lemmatisation. POS tagging quite fast.
commit 9959a64f7b
parent 7831b06610
@@ -53,7 +53,7 @@ cdef class Language:
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
-        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+        cdef Tokens tokens = Tokens(self, length)
         if length == 0:
             return tokens
         cdef UniStr string_struct
@@ -81,7 +81,7 @@ cdef class Language:
         tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
         """
         cdef int length = len(string)
-        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+        cdef Tokens tokens = Tokens(self, length)
         if length == 0:
             return tokens
         cdef int i = 0
@@ -110,11 +110,10 @@ cdef class Language:
         return tokens

     cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
-        cached = <Cached*>self._specials.get(key)
+        #cached = <Cached*>self._specials.get(key)
+        cached = <Cached*>self._cache.get(key)
         if cached == NULL:
-            cached = <Cached*>self._cache.get(key)
-            if cached == NULL:
-                return False
+            return False
         cdef int i
         if cached.is_lex:
             for i in range(cached.length):
@@ -266,6 +265,7 @@ cdef class Language:
             cached.data.tokens = tokens
             slice_unicode(&string, chunk, 0, len(chunk))
             self._specials.set(string.key, cached)
+            self._cache.set(string.key, cached)


 cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
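Taken together, the two hunks above flatten the tokenizer's cache lookup: special-case analyses are now written into the general _cache as well as _specials, so _try_cache makes a single probe instead of checking two tables. A minimal Python sketch of the resulting logic, with plain dicts standing in for the PreshMap tables (names are illustrative, not the module's API):

    specials = {}   # hand-written tokenization exceptions
    cache = {}      # general memoisation of previously seen chunks

    def add_special(key, analysis):
        # After this commit, specials are mirrored into the general cache...
        specials[key] = analysis
        cache[key] = analysis

    def try_cache(key):
        # ...so lookup needs only one probe instead of specials-then-cache.
        return cache.get(key)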
@@ -80,6 +80,7 @@ cpdef enum attr_id_t:
     LENGTH
     CLUSTER
     POS_TYPE
+    LEMMA


 cdef struct Lexeme:
@@ -1,10 +1,13 @@
-from .tokens cimport TokenC, Morphology
+from .tokens cimport TokenC
+from .lexeme cimport Lexeme
+from .utf8string cimport StringStore
+from .typedefs cimport id_t, Morphology

 from preshed.maps cimport PreshMapArray
 from cymem.cymem cimport Pool


 # Google universal tag set
 cpdef enum univ_tag_t:
     NO_TAG
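univ_tag_t refers to the Google universal part-of-speech tag set (Petrov, Das and McDonald, 2011), which collapses fine-grained treebank tags into roughly a dozen coarse categories. Only NO_TAG is visible in this hunk; the remaining members are presumably along these lines (a sketch of the standard tag inventory, not the file's actual contents):

    # The 12 universal POS categories plus a null tag; the enum in the
    # .pxd file may name or order them differently.
    UNIVERSAL_TAGS = [
        'NO_TAG', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN',
        'NUM', 'PRON', 'PRT', 'PUNCT', 'VERB', 'X',
    ]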
@@ -52,9 +52,9 @@ cdef class Morphologizer:
             self.tags[i].morph.person = props.get('person', 0)
             self.tags[i].morph.case = props.get('case', 0)
             self.tags[i].morph.misc = props.get('misc', 0)
-        if path.exists(path.join(data_dir, 'morph.json')):
-            with open(path.join(data_dir, 'morph.json')) as file_:
-                self.load_exceptions(json.loads(file_))
+        if path.exists(path.join(data_dir, 'morphs.json')):
+            with open(path.join(data_dir, 'morphs.json')) as file_:
+                self.load_exceptions(json.load(file_))

     cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
         if self.lemmatizer is None:
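Besides pointing at the right filename (morphs.json rather than morph.json), the hunk above fixes a real bug: json.loads parses a string, while json.load reads from a file object, so the old code would have raised a TypeError when handed file_ directly. A quick runnable illustration of the distinction, using io.StringIO as a stand-in for the exceptions file:

    import io
    import json

    payload = '{"dogs": {"L": "dog"}}'             # hypothetical morphs.json content
    exceptions = json.load(io.StringIO(payload))   # json.load: file-like object
    assert exceptions == json.loads(payload)       # json.loads: str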
@@ -9,7 +9,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 from preshed.maps cimport PreshMapArray

 from .typedefs cimport hash_t, id_t
-from .tokens cimport Tokens, Morphology
+from .tokens cimport Tokens


 cdef class Tagger:
@@ -7,19 +7,10 @@ from thinc.typedefs cimport atom_t
 from .lexeme cimport Lexeme

 from .typedefs cimport flags_t
 from .utf8string cimport StringStore
-from libc.stdint cimport uint8_t, uint16_t
+from .typedefs cimport Morphology
+from .lang cimport Language


-cdef struct Morphology:
-    uint8_t number
-    uint8_t tenspect # Tense/aspect/voice
-    uint8_t mood
-    uint8_t gender
-    uint8_t person
-    uint8_t case
-    uint8_t misc
-
-
 cdef struct TokenC:
     const Lexeme* lex
@@ -40,7 +31,8 @@ ctypedef fused LexemeOrToken:

 cdef class Tokens:
     cdef Pool mem
-    cdef StringStore _string_store
+    cdef Language lang
+    cdef list tag_names

     cdef TokenC* data
@@ -48,16 +40,15 @@ cdef class Tokens:
     cdef int max_length

     cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
-    cpdef int set_tag(self, int i, int tag_type, int tag) except -1

     cpdef np.ndarray[long, ndim=2] get_array(self, list features)


 cdef class Token:
-    cdef StringStore _string_store
+    cdef public Language lang
     cdef public int i
     cdef public int idx
-    cdef public int pos
+    cdef int pos
     cdef int lemma

     cdef public atom_t id
@@ -30,8 +30,8 @@ cdef class Tokens:
     >>> from spacy.en import EN
     >>> tokens = EN.tokenize('An example sentence.')
     """
-    def __init__(self, StringStore string_store, string_length=0):
-        self._string_store = string_store
+    def __init__(self, Language lang, string_length=0):
+        self.lang = lang
         if string_length >= 3:
             size = int(string_length / 3.0)
         else:
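The constructor now receives the Language instance itself instead of just its string store, giving the Tokens object a route to the lexicon and tagger. The pre-allocation heuristic is unchanged: assume roughly one token per three characters of input. A sketch of that sizing logic in plain Python (the else branch is truncated in this hunk, so the fallback below is an assumption):

    def guess_token_capacity(string_length):
        # Assume ~3 characters per token on average.
        if string_length >= 3:
            return int(string_length / 3.0)
        # Fallback for very short inputs (assumed; not shown in the diff).
        return max(string_length, 1)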
@@ -50,7 +50,7 @@ cdef class Tokens:

     def __getitem__(self, i):
         bounds_check(i, self.length, PADDING)
-        return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
+        return Token(self.lang, i, self.data[i].idx, self.data[i].pos,
                      self.data[i].lemma, self.data[i].lex[0])

     def __iter__(self):
@@ -71,9 +71,6 @@ cdef class Tokens:
         self.length += 1
         return idx + t.lex.length

-    cpdef int set_tag(self, int i, int tag_type, int tag) except -1:
-        self.data[i].pos = tag
-
     @cython.boundscheck(False)
     cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
         cdef int i, j
@@ -92,7 +89,10 @@ cdef class Tokens:

         cdef PreshCounter counts = PreshCounter(2 ** 8)
         for i in range(self.length):
-            attr = get_attr(self.data[i].lex, attr_id)
+            if attr_id == LEMMA:
+                attr = self.data[i].lemma
+            else:
+                attr = get_attr(self.data[i].lex, attr_id)
             counts.inc(attr, 1)
         return dict(counts)
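The counting loop has to special-case lemmas: a lemma is a token-level attribute (it depends on the POS tag assigned in context), so it lives on the token rather than on the shared, context-independent Lexeme entry. A Python sketch of the same branching using collections.Counter, with dicts standing in for the C structs (LEMMA and the field names are stand-ins):

    from collections import Counter

    LEMMA = 'lemma'   # stand-in for the attr_id enum value

    def count_by(tokens, attr_id):
        counts = Counter()
        for tok in tokens:
            if attr_id == LEMMA:
                attr = tok['lemma']          # token-level: depends on the tag
            else:
                attr = tok['lex'][attr_id]   # lexeme-level: shared entry
            counts[attr] += 1
        return dict(counts)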
@@ -114,9 +114,9 @@ cdef class Tokens:

 @cython.freelist(64)
 cdef class Token:
-    def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma,
-                 dict lex):
-        self._string_store = string_store
+    def __init__(self, Language lang, int i, int idx,
+                 int pos, int lemma, dict lex):
+        self.lang = lang
         self.idx = idx
         self.pos = pos
         self.i = i
@@ -141,12 +141,16 @@ cdef class Token:
         def __get__(self):
             if self.sic == 0:
                 return ''
-            cdef bytes utf8string = self._string_store[self.sic]
+            cdef bytes utf8string = self.lang.lexicon.strings[self.sic]
             return utf8string.decode('utf8')

     property lemma:
         def __get__(self):
             if self.lemma == 0:
                 return self.string
-            cdef bytes utf8string = self._string_store[self.lemma]
+            cdef bytes utf8string = self.lang.lexicon.strings[self.lemma]
             return utf8string.decode('utf8')
+
+    property pos:
+        def __get__(self):
+            return self.lang.pos_tagger.tag_names[self.pos]
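With a Language reference on board, Token resolves its surface string and lemma lazily through the shared string store, and the new pos property maps the integer tag id to a human-readable name via the tagger. A rough Python analogue of the accessor pattern (class layout and attribute names are stand-ins for the Cython originals):

    class Token:
        def __init__(self, lang, sic, lemma_id, pos_id):
            self.lang = lang
            self.sic = sic            # string id; 0 means no string
            self.lemma_id = lemma_id  # 0 means no distinct lemma
            self.pos_id = pos_id

        @property
        def string(self):
            if self.sic == 0:
                return ''
            return self.lang.lexicon.strings[self.sic]

        @property
        def lemma(self):
            if self.lemma_id == 0:
                return self.string    # fall back to the surface form
            return self.lang.lexicon.strings[self.lemma_id]

        @property
        def pos(self):
            return self.lang.pos_tagger.tag_names[self.pos_id]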
@@ -1,4 +1,5 @@
 from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
+from libc.stdint cimport uint8_t

 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
@@ -7,3 +8,13 @@ ctypedef uint64_t flags_t
 ctypedef uint32_t id_t
 ctypedef uint16_t len_t
 ctypedef uint16_t tag_t
+
+
+cdef struct Morphology:
+    uint8_t number
+    uint8_t tenspect # Tense/aspect/voice
+    uint8_t mood
+    uint8_t gender
+    uint8_t person
+    uint8_t case
+    uint8_t misc
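Moving Morphology into typedefs.pxd removes the circular dependency between tokens.pxd and the modules that cimport the struct. Every field is a uint8_t, so one full analysis packs into seven bytes. A ctypes sketch to illustrate the layout (field names copied from the struct above):

    import ctypes

    class Morphology(ctypes.Structure):
        _fields_ = [
            ('number',   ctypes.c_uint8),
            ('tenspect', ctypes.c_uint8),  # tense/aspect/voice
            ('mood',     ctypes.c_uint8),
            ('gender',   ctypes.c_uint8),
            ('person',   ctypes.c_uint8),
            ('case',     ctypes.c_uint8),
            ('misc',     ctypes.c_uint8),
        ]

    print(ctypes.sizeof(Morphology))  # 7: one byte per feature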