Mirror of https://github.com/explosion/spaCy.git
Improve efficiency of tagger, and improve morphological processing
commit 42973c4b37
parent 6b34a2f34b

spacy/en.pxd (18 lines changed)

@@ -125,23 +125,5 @@ cpdef enum:
     N_CONTEXT_FIELDS
 
 
-cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil:
-    _fill_from_token(&context[P2_sic], &tokens[i-2])
-    _fill_from_token(&context[P1_sic], &tokens[i-1])
-    _fill_from_token(&context[W_sic], &tokens[i])
-    _fill_from_token(&context[N1_sic], &tokens[i+1])
-    _fill_from_token(&context[N2_sic], &tokens[i+2])
-
-
-cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
-    context[0] = t.lex.sic
-    context[1] = t.lex.cluster
-    context[2] = t.lex.shape
-    context[3] = t.lex.prefix
-    context[4] = t.lex.suffix
-    context[5] = t.pos
-    context[6] = t.sense
-
-
 cdef class English(Language):
     pass

spacy/en.pyx (44 lines changed)

@@ -151,10 +151,14 @@ cdef class English(Language):
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
         cdef TokenC* t = tokens.data
+        assert self.morphologizer is not None
+        cdef dict tagdict = self.pos_tagger.tagdict
         for i in range(tokens.length):
-            fill_pos_context(context, i, t)
-            t[i].pos = self.pos_tagger.predict(context)
-            if self.morphologizer:
-                self.morphologizer.set_morph(i, t)
+            if t[i].lex.sic in tagdict:
+                t[i].pos = tagdict[t[i].lex.sic]
+            else:
+                fill_pos_context(context, i, t)
+                t[i].pos = self.pos_tagger.predict(context)
+            self.morphologizer.set_morph(i, t)
 
     def train_pos(self, Tokens tokens, golds):
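
The hunk above adds a dictionary fast path: words that are frequent and almost always take the same tag are tagged by lookup, and the perceptron only runs on the rest. A minimal Python sketch of the same control flow, with hypothetical stand-ins for the tagger and token fields:

# Fast-path tagging: dictionary lookup for unambiguous words,
# statistical model for everything else.
def set_tags(words, tagdict, predict_tag):
    tags = []
    for i, word in enumerate(words):
        if word in tagdict:
            tags.append(tagdict[word])         # O(1), no feature extraction
        else:
            context = context_window(words, i)
            tags.append(predict_tag(context))  # full model only when needed
    return tags

def context_window(words, i, pad=None):
    # +/-2-token window with edge padding, mirroring fill_pos_context
    return [words[j] if 0 <= j < len(words) else pad
            for j in range(i - 2, i + 3)]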

@@ -165,27 +169,27 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            if self.morphologizer:
-                self.morphologizer.set_morph(i, t)
+            self.morphologizer.set_morph(i, t)
             c += t[i].pos == golds[i]
         return c
 
 
-cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
-    if tok_morph.number == 0:
-        tok_morph.number = pos_morph.number
-    if tok_morph.tenspect == 0:
-        tok_morph.tenspect = pos_morph.tenspect
-    if tok_morph.mood == 0:
-        tok_morph.mood = pos_morph.mood
-    if tok_morph.gender == 0:
-        tok_morph.gender = pos_morph.gender
-    if tok_morph.person == 0:
-        tok_morph.person = pos_morph.person
-    if tok_morph.case == 0:
-        tok_morph.case = pos_morph.case
-    if tok_morph.misc == 0:
-        tok_morph.misc = pos_morph.misc
+cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
+    _fill_from_token(&context[P2_sic], &tokens[i-2])
+    _fill_from_token(&context[P1_sic], &tokens[i-1])
+    _fill_from_token(&context[W_sic], &tokens[i])
+    _fill_from_token(&context[N1_sic], &tokens[i+1])
+    _fill_from_token(&context[N2_sic], &tokens[i+2])
+
+
+cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
+    context[0] = t.lex.sic
+    context[1] = t.lex.cluster
+    context[2] = t.lex.shape
+    context[3] = t.lex.prefix
+    context[4] = t.lex.suffix
+    context[5] = t.pos
+    context[6] = t.sense
 
 
 EN = English('en')
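
Here `fill_pos_context` and `_fill_from_token` move from en.pxd into en.pyx, with the outer function gaining `except -1` so errors can propagate. For orientation, the context is one flat array: five window slots (P2, P1, W, N1, N2), each filled with the same seven lexical fields. A rough Python rendering of that layout (field names follow the diff; treating tokens as dicts is an assumption):

# Flat tagger context: 5 window slots x 7 fields per token.
FIELDS = ('sic', 'cluster', 'shape', 'prefix', 'suffix', 'pos', 'sense')

def fill_pos_context(tokens, i):
    context = []
    for offset in (-2, -1, 0, 1, 2):  # P2, P1, W, N1, N2
        j = i + offset
        tok = tokens[j] if 0 <= j < len(tokens) else {}
        # absent fields become 0, like zeroed padding tokens in C
        context.extend(tok.get(f, 0) for f in FIELDS)
    return context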

@@ -35,8 +35,8 @@ cdef class Morphologizer:
     cdef StringStore strings
     cdef object lemmatizer
     cdef PosTag* tags
+    cdef readonly list tag_names
 
-    cdef PreshMapArray _morph
-    cdef PreshMapArray _lemmas
+    cdef PreshMapArray _cache
     cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
     cdef int set_morph(self, const int i, TokenC* tokens) except -1

@@ -1,8 +1,10 @@
+# cython: profile=True
+# cython: embedsignature=True
 from os import path
 import json
 
 from .lemmatizer import Lemmatizer
+from .typedefs cimport id_t
 
 UNIV_TAGS = {
     'NULL': NO_TAG,

@@ -22,6 +24,11 @@ UNIV_TAGS = {
 }
 
 
+cdef struct _Cached:
+    Morphology morph
+    int lemma
+
+
 cdef class Morphologizer:
     """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
     """

@@ -30,12 +37,11 @@ cdef class Morphologizer:
         self.strings = strings
         cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
         tag_map = cfg['tag_map']
-        tag_names = cfg['tag_names']
+        self.tag_names = cfg['tag_names']
         self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
-        self._lemmas = PreshMapArray(N_UNIV_TAGS)
-        self._morph = PreshMapArray(len(tag_names))
-        self.tags = <PosTag*>self.mem.alloc(len(tag_names), sizeof(PosTag))
-        for i, tag in enumerate(tag_names):
+        self._cache = PreshMapArray(len(self.tag_names))
+        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
+        for i, tag in enumerate(self.tag_names):
             pos, props = tag_map[tag]
             self.tags[i].id = i
             self.tags[i].pos = pos

@@ -46,15 +52,15 @@ cdef class Morphologizer:
             self.tags[i].morph.person = props.get('person', 0)
             self.tags[i].morph.case = props.get('case', 0)
             self.tags[i].morph.misc = props.get('misc', 0)
+        if path.exists(path.join(data_dir, 'morph.json')):
+            with open(path.join(data_dir, 'morph.json')) as file_:
+                self.load_exceptions(json.loads(file_))
 
     cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
         if pos != NOUN and pos != VERB and pos != ADJ:
             return lex.sic
-        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
-        if lemma != 0:
-            return lemma
         cdef bytes py_string = self.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string

@@ -67,15 +73,45 @@ cdef class Morphologizer:
             lemma_strings = self.lemmatizer.adj(py_string)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos, lex.sic, <void*>lemma)
         return lemma
 
     cdef int set_morph(self, const int i, TokenC* tokens) except -1:
         cdef const PosTag* tag = &self.tags[tokens[i].pos]
-        tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex)
-        morph = <Morphology*>self._morph.get(tag.id, tokens[i].lemma)
-        if morph is NULL:
-            self._morph.set(tag.id, tokens[i].lemma, <void*>&tag.morph)
-            tokens[i].morph = tag.morph
-        else:
-            tokens[i].morph = morph[0]
+        cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
+        if cached is NULL:
+            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
+            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
+            cached.morph = tag.morph
+            self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
+
+        tokens[i].lemma = cached.lemma
+        tokens[i].morph = cached.morph
+
+    def load_exceptions(self, dict exc):
+        cdef unicode pos_str
+        cdef unicode form_str
+        cdef unicode lemma_str
+        cdef dict entries
+        cdef dict props
+        cdef int lemma
+        cdef id_t sic
+        cdef univ_tag_t pos
+        for pos_str, entries in exc.items():
+            pos = self.tag_names.index(pos_str)
+            for form_str, props in entries.items():
+                lemma_str = props.get('L', form_str)
+                sic = self.strings[form_str]
+                cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
+                cached.lemma = self.strings[lemma_str]
+                set_morph_from_dict(&cached.morph, props)
+                self._cache.set(pos, sic, <void*>cached)
+
+
+cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
+    morph.number = props.get('number', 0)
+    morph.tenspect = props.get('tenspect', 0)
+    morph.mood = props.get('mood', 0)
+    morph.gender = props.get('gender', 0)
+    morph.person = props.get('person', 0)
+    morph.case = props.get('case', 0)
+    morph.misc = props.get('misc', 0)
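
The rewritten `set_morph` collapses the earlier per-lemma maps into one cache keyed by (tag id, word form), storing lemma and morphology together in a `_Cached` entry, so lemmatization runs at most once per pair; `load_exceptions` pre-seeds the same cache so irregular forms override the tag's defaults. A hedged Python sketch of the pattern (plain tuples stand in for the `_Cached` struct, dicts for tokens and tags):

_cache = {}

def set_morph(token, tag, lemmatize):
    # First sighting of (tag, form) pays for lemmatization;
    # every later sighting is a single dictionary lookup.
    key = (tag['id'], token['form'])
    entry = _cache.get(key)
    if entry is None:
        entry = (lemmatize(tag['pos'], token['form']), tag['morph'])
        _cache[key] = entry
    token['lemma'], token['morph'] = entry

def load_exceptions(exc, tag_names):
    # Pre-seed the cache from {tag: {form: {'L': lemma, ...props}}},
    # mirroring the 'L' key used in the diff.
    for pos_str, entries in exc.items():
        tag_id = tag_names.index(pos_str)
        for form, props in entries.items():
            _cache[(tag_id, form)] = (props.get('L', form), props)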

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 import unicodedata
 from unidecode import unidecode
+import re
 
 import math

@@ -8,7 +8,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 from preshed.maps cimport PreshMapArray
 
-from .typedefs cimport hash_t
+from .typedefs cimport hash_t, id_t
 from .tokens cimport Tokens, Morphology

@@ -72,10 +72,9 @@ cdef class Tagger:
         return tag_id
 
 
-
 def _make_tag_dict(counts):
-    freq_thresh = 50
-    ambiguity_thresh = 0.98
+    freq_thresh = 20
+    ambiguity_thresh = 0.97
     tagdict = {}
     cdef atom_t word
     cdef atom_t tag
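
Loosening the thresholds (frequency 50→20, ambiguity 0.98→0.97) admits more words into the lookup table, so more tokens take the fast path added in en.pyx. The diff shows only the thresholds; a plausible sketch of how such a table is built from per-word tag counts, with the construction itself an assumption:

def make_tag_dict(counts, freq_thresh=20, ambiguity_thresh=0.97):
    """Build a word->tag table for frequent, nearly unambiguous words.

    counts: dict mapping word -> {tag: count}. Threshold values follow
    the diff; everything else here is illustrative.
    """
    tagdict = {}
    for word, tag_counts in counts.items():
        total = sum(tag_counts.values())
        tag, mode = max(tag_counts.items(), key=lambda kv: kv[1])
        # keep only words seen often that almost always carry one tag
        if total >= freq_thresh and mode >= ambiguity_thresh * total:
            tagdict[word] = tag
    return tagdict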