mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 17:52:31 +03:00
* POS tag memoisation working, with good speed-up
This commit is contained in:
parent
ca54d58638
commit
792802b2b9
33
spacy/en.pyx
33
spacy/en.pyx
|
@ -32,8 +32,10 @@ provides a fully Penn Treebank 3-compliant tokenizer.
|
||||||
'''
|
'''
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
cimport lang
|
cimport lang
|
||||||
from .typedefs cimport flags_t
|
from .typedefs cimport hash_t, id_t, flags_t
|
||||||
import orth
|
import orth
|
||||||
from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
||||||
from .morphology cimport X, PUNCT, EOL
|
from .morphology cimport X, PUNCT, EOL
|
||||||
|
@ -41,6 +43,9 @@ from .morphology cimport X, PUNCT, EOL
|
||||||
from .tokens cimport Morphology
|
from .tokens cimport Morphology
|
||||||
|
|
||||||
|
|
||||||
|
DEF USE_POS_CACHE = True
|
||||||
|
|
||||||
|
|
||||||
POS_TAGS = {
|
POS_TAGS = {
|
||||||
'NULL': (NO_TAG, {}),
|
'NULL': (NO_TAG, {}),
|
||||||
'EOL': (EOL, {}),
|
'EOL': (EOL, {}),
|
||||||
|
@ -134,6 +139,20 @@ cdef class English(Language):
|
||||||
name (unicode): The two letter code used by Wikipedia for the language.
|
name (unicode): The two letter code used by Wikipedia for the language.
|
||||||
lexicon (Lexicon): The lexicon. Exposes the lookup method.
|
lexicon (Lexicon): The lexicon. Exposes the lookup method.
|
||||||
"""
|
"""
|
||||||
|
def load_pos_cache(self, loc, int max_entries=500000):
    """Fill the POS-tag cache from a whitespace-delimited text file.

    Each usable line needs at least three fields. The second field is parsed
    as the integer cache key and the third as the integer POS tag; the first
    field is not used by this method.

    loc (unicode): Path of the cache file to read.
    max_entries (int): Upper bound on the number of entries loaded. Defaults
        to 500000, the limit that used to be hard-coded here.

    Raises whatever open() raises if the file cannot be read, and ValueError
    if a key or tag field is not an integer.
    """
    cdef int n_loaded = 0
    cdef hash_t key
    cdef int pos
    with open(loc) as file_:
        for line in file_:
            # Check the limit first so no work is done on a line we drop.
            if n_loaded >= max_entries:
                break
            pieces = line.split()
            # Blank or truncated lines (e.g. a trailing newline) are skipped
            # instead of failing with IndexError.
            if len(pieces) < 3:
                continue
            key = int(pieces[1])
            pos = int(pieces[2])
            # Widen through size_t before converting to a pointer; the lookup
            # side decodes with <int><size_t>, so this keeps the round trip
            # symmetric. NOTE(review): a tag value of 0 becomes a NULL
            # pointer, which a `!= NULL` lookup test cannot tell apart from a
            # missing entry -- confirm that falling back to the tagger for
            # such entries is acceptable.
            self._pos_cache.set(key, <void*><size_t>pos)
            n_loaded += 1
|
||||||
|
|
||||||
def get_props(self, unicode string):
    """Build the property dict for a word string.

    string (unicode): The word form to describe.

    Returns a dict with two entries: 'flags', the result of
    self.set_flags(string), and 'dense', the result of
    orth.word_shape(string).
    """
    # Evaluate in the same order as the dict literal did: flags, then shape.
    word_flags = self.set_flags(string)
    shape = orth.word_shape(string)
    props = {}
    props['flags'] = word_flags
    props['dense'] = shape
    return props
|
||||||
|
|
||||||
|
@ -156,11 +175,19 @@ cdef class English(Language):
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef atom_t[N_CONTEXT_FIELDS] context
|
cdef atom_t[N_CONTEXT_FIELDS] context
|
||||||
cdef TokenC* t = tokens.data
|
cdef TokenC* t = tokens.data
|
||||||
|
cdef id_t[2] bigram
|
||||||
|
cdef hash_t cache_key
|
||||||
|
cdef void* cached = NULL
|
||||||
assert self.morphologizer is not None
|
assert self.morphologizer is not None
|
||||||
cdef dict tagdict = self.pos_tagger.tagdict
|
cdef dict tagdict = self.pos_tagger.tagdict
|
||||||
for i in range(tokens.length):
|
for i in range(tokens.length):
|
||||||
if t[i].lex.sic in tagdict:
|
if USE_POS_CACHE:
|
||||||
t[i].pos = tagdict[t[i].lex.sic]
|
bigram[0] = tokens.data[i].lex.sic
|
||||||
|
bigram[1] = tokens.data[i-1].lex.sic
|
||||||
|
cache_key = hash64(bigram, sizeof(id_t) * 2, 0)
|
||||||
|
cached = self._pos_cache.get(cache_key)
|
||||||
|
if cached != NULL:
|
||||||
|
t[i].pos = <int><size_t>cached
|
||||||
else:
|
else:
|
||||||
fill_pos_context(context, i, t)
|
fill_pos_context(context, i, t)
|
||||||
t[i].pos = self.pos_tagger.predict(context)
|
t[i].pos = self.pos_tagger.predict(context)
|
||||||
|
|
|
@ -44,6 +44,7 @@ cdef class Language:
|
||||||
cpdef readonly Tagger pos_tagger
|
cpdef readonly Tagger pos_tagger
|
||||||
cpdef readonly Morphologizer morphologizer
|
cpdef readonly Morphologizer morphologizer
|
||||||
|
|
||||||
|
cdef PreshMap _pos_cache
|
||||||
cdef object _prefix_re
|
cdef object _prefix_re
|
||||||
cdef object _suffix_re
|
cdef object _suffix_re
|
||||||
cdef object _infix_re
|
cdef object _infix_re
|
||||||
|
|
|
@ -34,6 +34,7 @@ cdef class Language:
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._cache = PreshMap(2 ** 25)
|
self._cache = PreshMap(2 ** 25)
|
||||||
self._specials = PreshMap(2 ** 16)
|
self._specials = PreshMap(2 ** 16)
|
||||||
|
self._pos_cache = PreshMap(2 ** 16)
|
||||||
rules, prefix, suffix, infix = util.read_lang_data(name)
|
rules, prefix, suffix, infix = util.read_lang_data(name)
|
||||||
self._prefix_re = re.compile(prefix)
|
self._prefix_re = re.compile(prefix)
|
||||||
self._suffix_re = re.compile(suffix)
|
self._suffix_re = re.compile(suffix)
|
||||||
|
@ -50,6 +51,7 @@ cdef class Language:
|
||||||
self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
|
self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
|
||||||
self.morphologizer = Morphologizer(self.lexicon.strings,
|
self.morphologizer = Morphologizer(self.lexicon.strings,
|
||||||
path.join(util.DATA_DIR, self.name))
|
path.join(util.DATA_DIR, self.name))
|
||||||
|
self.load_pos_cache(path.join(util.DATA_DIR, self.name, 'pos', 'bigram_cache_2m'))
|
||||||
|
|
||||||
cpdef Tokens tokens_from_list(self, list strings):
|
cpdef Tokens tokens_from_list(self, list strings):
|
||||||
cdef int length = sum([len(s) for s in strings])
|
cdef int length = sum([len(s) for s in strings])
|
||||||
|
|
Loading…
Reference in New Issue
Block a user