* Working morphology and lemmatisation. POS tagging quite fast.

This commit is contained in:
Matthew Honnibal 2014-12-10 08:09:32 +11:00
parent 7831b06610
commit 9959a64f7b
8 changed files with 48 additions and 38 deletions
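
Taken together, the changes below let a Token resolve its lemma and POS tag name through the owning Language object. A rough usage sketch, based only on the docstring and properties visible in this diff; how the POS tagger gets applied to the tokens is not shown here, so treat that step as an assumption:

    from spacy.en import EN

    tokens = EN.tokenize('An example sentence.')
    token = tokens[1]
    print(token.string)  # surface form, resolved through EN.lexicon.strings
    print(token.lemma)   # lemma string; falls back to the surface form if unset
    print(token.pos)     # tag name, looked up in EN.pos_tagger.tag_names
                         # (assumes the tagger has already run over `tokens`)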

View File

@@ -53,7 +53,7 @@ cdef class Language:
    cpdef Tokens tokens_from_list(self, list strings):
        cdef int length = sum([len(s) for s in strings])
-        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+        cdef Tokens tokens = Tokens(self, length)
        if length == 0:
            return tokens
        cdef UniStr string_struct

@@ -81,7 +81,7 @@ cdef class Language:
            tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
        """
        cdef int length = len(string)
-        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+        cdef Tokens tokens = Tokens(self, length)
        if length == 0:
            return tokens
        cdef int i = 0

@@ -110,8 +110,7 @@ cdef class Language:
        return tokens

    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
-        cached = <Cached*>self._specials.get(key)
-        if cached == NULL:
-            cached = <Cached*>self._cache.get(key)
+        #cached = <Cached*>self._specials.get(key)
+        cached = <Cached*>self._cache.get(key)
        if cached == NULL:
            return False

@@ -266,6 +265,7 @@ cdef class Language:
            cached.data.tokens = tokens
            slice_unicode(&string, chunk, 0, len(chunk))
            self._specials.set(string.key, cached)
+            self._cache.set(string.key, cached)


cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
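
The last hunk above also writes each special-case entry into the main cache, which is why the simplified _try_cache no longer checks _specials separately on the hot path. A plain-Python sketch of the same pattern, with dicts standing in for the preshed maps and illustrative names:

    specials = {}   # tokenisation exceptions loaded at setup time
    cache = {}      # general cache consulted for every token

    def add_special(key, cached_analysis):
        specials[key] = cached_analysis
        cache[key] = cached_analysis   # mirror into the main cache, as the diff now does

    def try_cache(key):
        # single lookup, matching the simplified _try_cache above
        return cache.get(key)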

View File

@@ -80,6 +80,7 @@ cpdef enum attr_id_t:
    LENGTH
    CLUSTER
    POS_TYPE
+    LEMMA


cdef struct Lexeme:

View File

@@ -1,10 +1,13 @@
-from .tokens cimport TokenC, Morphology
+from .tokens cimport TokenC
from .lexeme cimport Lexeme
from .utf8string cimport StringStore
+from .typedefs cimport id_t, Morphology
from preshed.maps cimport PreshMapArray
from cymem.cymem cimport Pool


# Google universal tag set
cpdef enum univ_tag_t:
    NO_TAG

View File

@@ -52,9 +52,9 @@ cdef class Morphologizer:
            self.tags[i].morph.person = props.get('person', 0)
            self.tags[i].morph.case = props.get('case', 0)
            self.tags[i].morph.misc = props.get('misc', 0)
-        if path.exists(path.join(data_dir, 'morph.json')):
-            with open(path.join(data_dir, 'morph.json')) as file_:
-                self.load_exceptions(json.loads(file_))
+        if path.exists(path.join(data_dir, 'morphs.json')):
+            with open(path.join(data_dir, 'morphs.json')) as file_:
+                self.load_exceptions(json.load(file_))

    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
        if self.lemmatizer is None:
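
The Morphologizer hunk above corrects the exceptions filename to morphs.json and switches from json.loads, which expects a string, to json.load, which reads from an open file object. The same pattern in isolation, with a hypothetical data directory:

    import json
    from os import path

    data_dir = 'data/en'   # hypothetical location of the model data
    morphs_loc = path.join(data_dir, 'morphs.json')
    if path.exists(morphs_loc):
        with open(morphs_loc) as file_:
            exceptions = json.load(file_)   # json.loads(file_) would fail: it wants a str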

View File

@@ -9,7 +9,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from preshed.maps cimport PreshMapArray

from .typedefs cimport hash_t, id_t
-from .tokens cimport Tokens, Morphology
+from .tokens cimport Tokens


cdef class Tagger:

View File

@@ -7,19 +7,10 @@ from thinc.typedefs cimport atom_t
from .lexeme cimport Lexeme
from .typedefs cimport flags_t
-from .utf8string cimport StringStore
-from libc.stdint cimport uint8_t, uint16_t
+from .typedefs cimport Morphology
+from .lang cimport Language

-cdef struct Morphology:
-    uint8_t number
-    uint8_t tenspect   # Tense/aspect/voice
-    uint8_t mood
-    uint8_t gender
-    uint8_t person
-    uint8_t case
-    uint8_t misc

cdef struct TokenC:
    const Lexeme* lex

@@ -40,7 +31,8 @@ ctypedef fused LexemeOrToken:

cdef class Tokens:
    cdef Pool mem
-    cdef StringStore _string_store
+    cdef Language lang
+    cdef list tag_names

    cdef TokenC* data

@@ -48,16 +40,15 @@ cdef class Tokens:
    cdef int max_length

    cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
-    cpdef int set_tag(self, int i, int tag_type, int tag) except -1
    cpdef np.ndarray[long, ndim=2] get_array(self, list features)


cdef class Token:
-    cdef StringStore _string_store
+    cdef public Language lang
    cdef public int i
    cdef public int idx
-    cdef public int pos
+    cdef int pos
    cdef int lemma
    cdef public atom_t id

View File

@@ -30,8 +30,8 @@ cdef class Tokens:
    >>> from spacy.en import EN
    >>> tokens = EN.tokenize('An example sentence.')
    """
-    def __init__(self, StringStore string_store, string_length=0):
-        self._string_store = string_store
+    def __init__(self, Language lang, string_length=0):
+        self.lang = lang
        if string_length >= 3:
            size = int(string_length / 3.0)
        else:

@@ -50,7 +50,7 @@ cdef class Tokens:
    def __getitem__(self, i):
        bounds_check(i, self.length, PADDING)
-        return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
+        return Token(self.lang, i, self.data[i].idx, self.data[i].pos,
                     self.data[i].lemma, self.data[i].lex[0])

    def __iter__(self):

@@ -71,9 +71,6 @@ cdef class Tokens:
        self.length += 1
        return idx + t.lex.length

-    cpdef int set_tag(self, int i, int tag_type, int tag) except -1:
-        self.data[i].pos = tag
-
    @cython.boundscheck(False)
    cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
        cdef int i, j

@@ -92,6 +89,9 @@ cdef class Tokens:
        cdef PreshCounter counts = PreshCounter(2 ** 8)
        for i in range(self.length):
-            attr = get_attr(self.data[i].lex, attr_id)
+            if attr_id == LEMMA:
+                attr = self.data[i].lemma
+            else:
+                attr = get_attr(self.data[i].lex, attr_id)
            counts.inc(attr, 1)
        return dict(counts)

@@ -114,9 +114,9 @@ cdef class Tokens:
@cython.freelist(64)
cdef class Token:
-    def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma,
-                 dict lex):
-        self._string_store = string_store
+    def __init__(self, Language lang, int i, int idx,
+                 int pos, int lemma, dict lex):
+        self.lang = lang
        self.idx = idx
        self.pos = pos
        self.i = i

@@ -141,12 +141,16 @@ cdef class Token:
        def __get__(self):
            if self.sic == 0:
                return ''
-            cdef bytes utf8string = self._string_store[self.sic]
+            cdef bytes utf8string = self.lang.lexicon.strings[self.sic]
            return utf8string.decode('utf8')

    property lemma:
        def __get__(self):
            if self.lemma == 0:
                return self.string
-            cdef bytes utf8string = self._string_store[self.lemma]
+            cdef bytes utf8string = self.lang.lexicon.strings[self.lemma]
            return utf8string.decode('utf8')
+
+    property pos:
+        def __get__(self):
+            return self.lang.pos_tagger.tag_names[self.pos]
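
Two user-visible changes in this file: the attribute-counting loop now understands the LEMMA id added in lexeme.pxd, and Token gains a pos property that maps the tag id to its name through the Language. A plain-Python analogue of the counting branch (the Cython method's name is not visible in this extract, and the dict-based token below is only a stand-in):

    from collections import Counter

    LEMMA = 'lemma'   # stand-in for the LEMMA attribute id

    def count_attr(tokens, attr_id):
        counts = Counter()
        for tok in tokens:
            if attr_id == LEMMA:
                attr = tok['lemma']          # lemmas live on the token...
            else:
                attr = tok['lex'][attr_id]   # ...other attributes on the lexeme
            counts[attr] += 1
        return dict(counts)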

View File

@@ -1,4 +1,5 @@
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
+from libc.stdint cimport uint8_t

ctypedef uint64_t hash_t
ctypedef char* utf8_t

@@ -7,3 +8,13 @@ ctypedef uint64_t flags_t
ctypedef uint32_t id_t
ctypedef uint16_t len_t
ctypedef uint16_t tag_t
+
+
+cdef struct Morphology:
+    uint8_t number
+    uint8_t tenspect   # Tense/aspect/voice
+    uint8_t mood
+    uint8_t gender
+    uint8_t person
+    uint8_t case
+    uint8_t misc
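
The Morphology struct now lives in typedefs.pxd rather than tokens.pxd, with each feature packed into a uint8_t code. The Morphologizer hunk earlier fills such a struct from a props dict; a plain-Python analogue of that fill, using the field names above and the same default of 0:

    MORPH_FIELDS = ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc')

    def set_morph_from_dict(props):
        # mirrors the props.get(<field>, 0) pattern used in Morphologizer
        return {field: props.get(field, 0) for field in MORPH_FIELDS}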