* Working morphology and lemmatisation. POS tagging quite fast.

This commit is contained in:
Matthew Honnibal 2014-12-10 08:09:32 +11:00
parent 7831b06610
commit 9959a64f7b
8 changed files with 48 additions and 38 deletions

View File

@ -53,7 +53,7 @@ cdef class Language:
cpdef Tokens tokens_from_list(self, list strings):
cdef int length = sum([len(s) for s in strings])
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
cdef Tokens tokens = Tokens(self, length)
if length == 0:
return tokens
cdef UniStr string_struct
@ -81,7 +81,7 @@ cdef class Language:
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
"""
cdef int length = len(string)
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
cdef Tokens tokens = Tokens(self, length)
if length == 0:
return tokens
cdef int i = 0
@ -110,11 +110,10 @@ cdef class Language:
return tokens
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
cached = <Cached*>self._specials.get(key)
#cached = <Cached*>self._specials.get(key)
cached = <Cached*>self._cache.get(key)
if cached == NULL:
cached = <Cached*>self._cache.get(key)
if cached == NULL:
return False
return False
cdef int i
if cached.is_lex:
for i in range(cached.length):
@ -266,6 +265,7 @@ cdef class Language:
cached.data.tokens = tokens
slice_unicode(&string, chunk, 0, len(chunk))
self._specials.set(string.key, cached)
self._cache.set(string.key, cached)
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:

View File

@ -80,6 +80,7 @@ cpdef enum attr_id_t:
LENGTH
CLUSTER
POS_TYPE
LEMMA
cdef struct Lexeme:

View File

@ -1,10 +1,13 @@
from .tokens cimport TokenC, Morphology
from .tokens cimport TokenC
from .lexeme cimport Lexeme
from .utf8string cimport StringStore
from .typedefs cimport id_t, Morphology
from preshed.maps cimport PreshMapArray
from cymem.cymem cimport Pool
# Google universal tag set
cpdef enum univ_tag_t:
NO_TAG

View File

@ -52,9 +52,9 @@ cdef class Morphologizer:
self.tags[i].morph.person = props.get('person', 0)
self.tags[i].morph.case = props.get('case', 0)
self.tags[i].morph.misc = props.get('misc', 0)
if path.exists(path.join(data_dir, 'morph.json')):
with open(path.join(data_dir, 'morph.json')) as file_:
self.load_exceptions(json.loads(file_))
if path.exists(path.join(data_dir, 'morphs.json')):
with open(path.join(data_dir, 'morphs.json')) as file_:
self.load_exceptions(json.load(file_))
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
if self.lemmatizer is None:

View File

@ -9,7 +9,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from preshed.maps cimport PreshMapArray
from .typedefs cimport hash_t, id_t
from .tokens cimport Tokens, Morphology
from .tokens cimport Tokens
cdef class Tagger:

View File

@ -7,19 +7,10 @@ from thinc.typedefs cimport atom_t
from .lexeme cimport Lexeme
from .typedefs cimport flags_t
from .utf8string cimport StringStore
from libc.stdint cimport uint8_t, uint16_t
from .typedefs cimport Morphology
from .lang cimport Language
cdef struct Morphology:
uint8_t number
uint8_t tenspect # Tense/aspect/voice
uint8_t mood
uint8_t gender
uint8_t person
uint8_t case
uint8_t misc
cdef struct TokenC:
const Lexeme* lex
@ -40,7 +31,8 @@ ctypedef fused LexemeOrToken:
cdef class Tokens:
cdef Pool mem
cdef StringStore _string_store
cdef Language lang
cdef list tag_names
cdef TokenC* data
@ -48,16 +40,15 @@ cdef class Tokens:
cdef int max_length
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
cpdef int set_tag(self, int i, int tag_type, int tag) except -1
cpdef np.ndarray[long, ndim=2] get_array(self, list features)
cdef class Token:
cdef StringStore _string_store
cdef public Language lang
cdef public int i
cdef public int idx
cdef public int pos
cdef int pos
cdef int lemma
cdef public atom_t id

View File

@ -30,8 +30,8 @@ cdef class Tokens:
>>> from spacy.en import EN
>>> tokens = EN.tokenize('An example sentence.')
"""
def __init__(self, StringStore string_store, string_length=0):
self._string_store = string_store
def __init__(self, Language lang, string_length=0):
self.lang = lang
if string_length >= 3:
size = int(string_length / 3.0)
else:
@ -50,7 +50,7 @@ cdef class Tokens:
def __getitem__(self, i):
bounds_check(i, self.length, PADDING)
return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
return Token(self.lang, i, self.data[i].idx, self.data[i].pos,
self.data[i].lemma, self.data[i].lex[0])
def __iter__(self):
@ -71,9 +71,6 @@ cdef class Tokens:
self.length += 1
return idx + t.lex.length
cpdef int set_tag(self, int i, int tag_type, int tag) except -1:
self.data[i].pos = tag
@cython.boundscheck(False)
cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
cdef int i, j
@ -92,7 +89,10 @@ cdef class Tokens:
cdef PreshCounter counts = PreshCounter(2 ** 8)
for i in range(self.length):
attr = get_attr(self.data[i].lex, attr_id)
if attr_id == LEMMA:
attr = self.data[i].lemma
else:
attr = get_attr(self.data[i].lex, attr_id)
counts.inc(attr, 1)
return dict(counts)
@ -114,9 +114,9 @@ cdef class Tokens:
@cython.freelist(64)
cdef class Token:
def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma,
dict lex):
self._string_store = string_store
def __init__(self, Language lang, int i, int idx,
int pos, int lemma, dict lex):
self.lang = lang
self.idx = idx
self.pos = pos
self.i = i
@ -141,12 +141,16 @@ cdef class Token:
def __get__(self):
if self.sic == 0:
return ''
cdef bytes utf8string = self._string_store[self.sic]
cdef bytes utf8string = self.lang.lexicon.strings[self.sic]
return utf8string.decode('utf8')
property lemma:
def __get__(self):
if self.lemma == 0:
return self.string
cdef bytes utf8string = self._string_store[self.lemma]
cdef bytes utf8string = self.lang.lexicon.strings[self.lemma]
return utf8string.decode('utf8')
# Getter for `pos`: uses the token's stored integer tag (self.pos) as an index
# into self.lang.pos_tagger.tag_names and returns that entry.
# NOTE(review): only a getter is visible in this hunk; presumably tag_names
# holds the tag labels as strings — confirm against the Tagger class.
property pos:
def __get__(self):
return self.lang.pos_tagger.tag_names[self.pos]

View File

@ -1,4 +1,5 @@
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
from libc.stdint cimport uint8_t
ctypedef uint64_t hash_t
ctypedef char* utf8_t
@ -7,3 +8,13 @@ ctypedef uint64_t flags_t
ctypedef uint32_t id_t
ctypedef uint16_t len_t
ctypedef uint16_t tag_t
# Record of morphological features for a token/tag, one uint8_t slot per
# feature.
# NOTE(review): the Morphologizer code in this same change fills fields such as
# person/case/misc with props.get(name, 0), so 0 appears to mean "not
# specified" — confirm before relying on it.
cdef struct Morphology:
uint8_t number
uint8_t tenspect # Tense/aspect/voice
uint8_t mood
uint8_t gender
uint8_t person
uint8_t case
uint8_t misc