Mirror of https://github.com/explosion/spaCy.git

commit 9959a64f7b
parent 7831b06610

* Working morphology and lemmatisation. POS tagging quite fast.
@@ -53,7 +53,7 @@ cdef class Language:
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
-        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+        cdef Tokens tokens = Tokens(self, length)
         if length == 0:
             return tokens
         cdef UniStr string_struct
@@ -81,7 +81,7 @@ cdef class Language:
         tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
         """
         cdef int length = len(string)
-        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+        cdef Tokens tokens = Tokens(self, length)
         if length == 0:
             return tokens
         cdef int i = 0
@@ -110,8 +110,7 @@ cdef class Language:
         return tokens

     cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
-        cached = <Cached*>self._specials.get(key)
-        if cached == NULL:
-            cached = <Cached*>self._cache.get(key)
+        #cached = <Cached*>self._specials.get(key)
+        cached = <Cached*>self._cache.get(key)
         if cached == NULL:
             return False
@@ -266,6 +265,7 @@ cdef class Language:
             cached.data.tokens = tokens
             slice_unicode(&string, chunk, 0, len(chunk))
             self._specials.set(string.key, cached)
+            self._cache.set(string.key, cached)


 cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
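Note on the tokenizer hunks above: Tokens is now constructed with the whole Language object (Tokens(self, length)) instead of just its StringStore, the _specials lookup in _try_cache is commented out, and special-case entries are additionally written into the main _cache. A rough Python sketch of the resulting cache behaviour, inferred from the visible lines only (the dicts and helper names are illustrative stand-ins for the PreshMap-backed Cython code, not the actual implementation):

    # Illustrative stand-ins for the _cache and _specials maps in the hunks above.
    _cache = {}
    _specials = {}

    def add_special_case(string, cached_entry):
        _specials[string] = cached_entry
        _cache[string] = cached_entry   # mirrors the added self._cache.set(string.key, cached)

    def try_cache(string):
        # Single lookup: the separate _specials check is commented out above,
        # because special cases are now present in _cache as well.
        return _cache.get(string)       # None plays the role of the "return False" path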
@@ -80,6 +80,7 @@ cpdef enum attr_id_t:
     LENGTH
     CLUSTER
     POS_TYPE
+    LEMMA


 cdef struct Lexeme:
@@ -1,10 +1,13 @@
-from .tokens cimport TokenC, Morphology
+from .tokens cimport TokenC
 from .lexeme cimport Lexeme
 from .utf8string cimport StringStore
+from .typedefs cimport id_t, Morphology

 from preshed.maps cimport PreshMapArray
 from cymem.cymem cimport Pool


 # Google universal tag set
 cpdef enum univ_tag_t:
     NO_TAG
@@ -52,9 +52,9 @@ cdef class Morphologizer:
             self.tags[i].morph.person = props.get('person', 0)
             self.tags[i].morph.case = props.get('case', 0)
             self.tags[i].morph.misc = props.get('misc', 0)
-        if path.exists(path.join(data_dir, 'morph.json')):
-            with open(path.join(data_dir, 'morph.json')) as file_:
-                self.load_exceptions(json.loads(file_))
+        if path.exists(path.join(data_dir, 'morphs.json')):
+            with open(path.join(data_dir, 'morphs.json')) as file_:
+                self.load_exceptions(json.load(file_))

     cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
         if self.lemmatizer is None:
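The Morphologizer hunk renames the expected data file from morph.json to morphs.json and swaps json.loads for json.load: json.loads parses a JSON string, so passing it an open file object would fail, while json.load reads and parses the file object directly. A minimal illustration (only the file name comes from the hunk; the rest is generic):

    import json

    # json.loads expects a JSON string, so json.loads(file_) would raise a TypeError;
    # json.load reads and parses the open file object directly.
    with open('morphs.json') as file_:
        exceptions = json.load(file_)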
@@ -9,7 +9,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 from preshed.maps cimport PreshMapArray

 from .typedefs cimport hash_t, id_t
-from .tokens cimport Tokens, Morphology
+from .tokens cimport Tokens


 cdef class Tagger:
@@ -7,19 +7,10 @@ from thinc.typedefs cimport atom_t
 from .lexeme cimport Lexeme

 from .typedefs cimport flags_t
-from .utf8string cimport StringStore
-from libc.stdint cimport uint8_t, uint16_t
+from .typedefs cimport Morphology
+from .lang cimport Language


-cdef struct Morphology:
-    uint8_t number
-    uint8_t tenspect # Tense/aspect/voice
-    uint8_t mood
-    uint8_t gender
-    uint8_t person
-    uint8_t case
-    uint8_t misc
-

 cdef struct TokenC:
     const Lexeme* lex
@@ -40,7 +31,8 @@ ctypedef fused LexemeOrToken:

 cdef class Tokens:
     cdef Pool mem
-    cdef StringStore _string_store
+    cdef Language lang
+    cdef list tag_names

     cdef TokenC* data
@@ -48,16 +40,15 @@ cdef class Tokens:
     cdef int max_length

     cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
-    cpdef int set_tag(self, int i, int tag_type, int tag) except -1

     cpdef np.ndarray[long, ndim=2] get_array(self, list features)


 cdef class Token:
-    cdef StringStore _string_store
+    cdef public Language lang
     cdef public int i
     cdef public int idx
-    cdef public int pos
+    cdef int pos
     cdef int lemma

     cdef public atom_t id
@@ -30,8 +30,8 @@ cdef class Tokens:
     >>> from spacy.en import EN
     >>> tokens = EN.tokenize('An example sentence.')
     """
-    def __init__(self, StringStore string_store, string_length=0):
-        self._string_store = string_store
+    def __init__(self, Language lang, string_length=0):
+        self.lang = lang
         if string_length >= 3:
             size = int(string_length / 3.0)
         else:
@@ -50,7 +50,7 @@ cdef class Tokens:

     def __getitem__(self, i):
         bounds_check(i, self.length, PADDING)
-        return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
+        return Token(self.lang, i, self.data[i].idx, self.data[i].pos,
                      self.data[i].lemma, self.data[i].lex[0])

     def __iter__(self):
@@ -71,9 +71,6 @@ cdef class Tokens:
         self.length += 1
         return idx + t.lex.length

-    cpdef int set_tag(self, int i, int tag_type, int tag) except -1:
-        self.data[i].pos = tag
-
     @cython.boundscheck(False)
     cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
         cdef int i, j
@@ -92,6 +89,9 @@ cdef class Tokens:

         cdef PreshCounter counts = PreshCounter(2 ** 8)
         for i in range(self.length):
-            attr = get_attr(self.data[i].lex, attr_id)
+            if attr_id == LEMMA:
+                attr = self.data[i].lemma
+            else:
+                attr = get_attr(self.data[i].lex, attr_id)
             counts.inc(attr, 1)
         return dict(counts)
@@ -114,9 +114,9 @@ cdef class Tokens:

 @cython.freelist(64)
 cdef class Token:
-    def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma,
-                 dict lex):
-        self._string_store = string_store
+    def __init__(self, Language lang, int i, int idx,
+                 int pos, int lemma, dict lex):
+        self.lang = lang
         self.idx = idx
         self.pos = pos
         self.i = i
@@ -141,12 +141,16 @@ cdef class Token:
         def __get__(self):
             if self.sic == 0:
                 return ''
-            cdef bytes utf8string = self._string_store[self.sic]
+            cdef bytes utf8string = self.lang.lexicon.strings[self.sic]
             return utf8string.decode('utf8')

     property lemma:
         def __get__(self):
             if self.lemma == 0:
                 return self.string
-            cdef bytes utf8string = self._string_store[self.lemma]
+            cdef bytes utf8string = self.lang.lexicon.strings[self.lemma]
             return utf8string.decode('utf8')
+
+    property pos:
+        def __get__(self):
+            return self.lang.pos_tagger.tag_names[self.pos]
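With the Tokens/Token hunks applied, a Token resolves its text and lemma through lang.lexicon.strings and exposes a human-readable tag via the new pos property. A small usage sketch, assuming the spacy.en.EN object from the docstring above is importable and has a POS tagger loaded; the printed values are illustrative only:

    from spacy.en import EN   # used as the Language instance, per the Tokens docstring

    tokens = EN.tokenize('An example sentence.')
    token = tokens[0]

    print(token.string)   # surface form, looked up via EN.lexicon.strings
    print(token.lemma)    # lemma string, falling back to the surface form if unset
    print(token.pos)      # tag name resolved through EN.pos_tagger.tag_names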
@@ -1,4 +1,5 @@
 from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
+from libc.stdint cimport uint8_t

 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
@@ -7,3 +8,13 @@ ctypedef uint64_t flags_t
 ctypedef uint32_t id_t
 ctypedef uint16_t len_t
 ctypedef uint16_t tag_t
+
+
+cdef struct Morphology:
+    uint8_t number
+    uint8_t tenspect # Tense/aspect/voice
+    uint8_t mood
+    uint8_t gender
+    uint8_t person
+    uint8_t case
+    uint8_t misc
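The Morphology struct now lives in typedefs.pxd (it was deleted from the tokens header above), so modules that only need the struct no longer have to cimport tokens. As a rough Python analogue of how a properties dict maps onto those fields, based on the props.get(..., 0) lines in the Morphologizer hunk and the set_morph_from_dict signature; the exact key set is an assumption:

    from dataclasses import dataclass

    @dataclass
    class Morphology:
        # Mirrors the uint8_t fields of the struct added above.
        number: int = 0
        tenspect: int = 0   # tense/aspect/voice, per the struct comment
        mood: int = 0
        gender: int = 0
        person: int = 0
        case: int = 0
        misc: int = 0

    def set_morph_from_dict(morph: Morphology, props: dict) -> None:
        # Same pattern as the Cython code above: missing keys default to 0.
        for field in ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc'):
            setattr(morph, field, props.get(field, 0))

    # Example: only 'person' and 'case' supplied, everything else stays 0.
    m = Morphology()
    set_morph_from_dict(m, {'person': 3, 'case': 1})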