mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
* All tests now passing for reorg
This commit is contained in:
parent
73f200436f
commit
b00bc01d8c
|
@ -1,9 +1,21 @@
|
||||||
|
from preshed.maps cimport PreshMapArray
|
||||||
|
|
||||||
from ..tagger cimport Tagger
|
from ..tagger cimport Tagger
|
||||||
from ..morphology cimport Morphologizer
|
|
||||||
from ..strings cimport StringStore
|
from ..strings cimport StringStore
|
||||||
|
from ..structs cimport TokenC, Lexeme, Morphology, PosTag
|
||||||
|
from ..typedefs cimport univ_tag_t
|
||||||
|
from .lemmatizer import Lemmatizer
|
||||||
|
|
||||||
|
|
||||||
cdef class EnPosTagger(Tagger):
|
cdef class EnPosTagger(Tagger):
|
||||||
cdef readonly StringStore strings
|
cdef readonly StringStore strings
|
||||||
cdef readonly StringStore tags
|
cdef public object lemmatizer
|
||||||
cdef readonly Morphologizer morphologizer
|
cdef PreshMapArray _morph_cache
|
||||||
|
|
||||||
|
cdef PosTag* tags
|
||||||
|
cdef readonly object tag_names
|
||||||
|
cdef readonly object tag_map
|
||||||
|
|
||||||
|
cdef int set_morph(self, const int i, TokenC* tokens) except -1
|
||||||
|
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
|
||||||
|
|
||||||
|
|
|
@ -3,10 +3,13 @@ import json
|
||||||
|
|
||||||
from thinc.typedefs cimport atom_t
|
from thinc.typedefs cimport atom_t
|
||||||
|
|
||||||
|
from ..typedefs cimport univ_tag_t
|
||||||
from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
||||||
from ..typedefs cimport X, PUNCT, EOL
|
from ..typedefs cimport X, PUNCT, EOL
|
||||||
from ..structs cimport TokenC, Morphology
|
from ..typedefs cimport id_t
|
||||||
|
from ..structs cimport TokenC, Morphology, Lexeme
|
||||||
from ..tokens cimport Tokens
|
from ..tokens cimport Tokens
|
||||||
|
from ..morphology cimport set_morph_from_dict
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
|
|
||||||
|
|
||||||
|
@ -194,29 +197,39 @@ POS_TEMPLATES = (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# One entry of the tagger's morphology cache: the analysis stored for a single
# (tag id, word id) pair.
cdef struct _CachedMorph:
    # Morphological features copied onto the token by set_morph.
    Morphology morph
    # Id of the lemma copied onto the token by set_morph.
    int lemma
|
||||||
|
|
||||||
|
|
||||||
cdef class EnPosTagger(Tagger):
|
cdef class EnPosTagger(Tagger):
|
||||||
def __init__(self, StringStore strings, data_dir):
|
def __init__(self, StringStore strings, data_dir):
|
||||||
model_dir = path.join(data_dir, 'pos')
|
model_dir = path.join(data_dir, 'pos')
|
||||||
Tagger.__init__(self, path.join(model_dir))
|
Tagger.__init__(self, path.join(model_dir))
|
||||||
self.strings = strings
|
self.strings = strings
|
||||||
cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
|
cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
|
||||||
self.tags = StringStore()
|
self.tag_names = sorted(cfg['tag_names'])
|
||||||
for tag in sorted(cfg['tag_names']):
|
self.tag_map = cfg['tag_map']
|
||||||
_ = self.tags[tag]
|
cdef int n_tags = len(self.tag_names) + 1
|
||||||
self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],
|
self._morph_cache = PreshMapArray(n_tags)
|
||||||
cfg['tag_map'],
|
self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
|
||||||
Lemmatizer(path.join(data_dir, 'wordnet'),
|
for i, tag in enumerate(sorted(self.tag_names)):
|
||||||
NOUN, VERB, ADJ))
|
pos, props = self.tag_map[tag]
|
||||||
|
self.tags[i].id = i
|
||||||
|
self.tags[i].pos = pos
|
||||||
|
set_morph_from_dict(&self.tags[i].morph, props)
|
||||||
|
if path.exists(path.join(data_dir, 'morphs.json')):
|
||||||
|
self.load_morph_exceptions(json.load(open(path.join(data_dir, 'morphs.json'))))
|
||||||
|
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
|
||||||
|
|
||||||
def __call__(self, Tokens tokens):
|
def __call__(self, Tokens tokens):
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef atom_t[N_CONTEXT_FIELDS] context
|
cdef atom_t[N_CONTEXT_FIELDS] context
|
||||||
cdef TokenC* t = tokens.data
|
cdef TokenC* t = tokens.data
|
||||||
assert self.morphologizer is not None
|
|
||||||
for i in range(tokens.length):
|
for i in range(tokens.length):
|
||||||
fill_context(context, i, t)
|
fill_context(context, i, t)
|
||||||
t[i].pos = self.predict(context)
|
t[i].pos = self.predict(context)
|
||||||
self.morphologizer.set_morph(i, t)
|
self.set_morph(i, t)
|
||||||
|
|
||||||
def train(self, Tokens tokens, golds):
|
def train(self, Tokens tokens, golds):
|
||||||
cdef int i
|
cdef int i
|
||||||
|
@ -226,10 +239,53 @@ cdef class EnPosTagger(Tagger):
|
||||||
for i in range(tokens.length):
|
for i in range(tokens.length):
|
||||||
fill_context(context, i, t)
|
fill_context(context, i, t)
|
||||||
t[i].pos = self.predict(context, [golds[i]])
|
t[i].pos = self.predict(context, [golds[i]])
|
||||||
self.morphologizer.set_morph(i, t)
|
self.set_morph(i, t)
|
||||||
c += t[i].pos == golds[i]
|
c += t[i].pos == golds[i]
|
||||||
return c
|
return c
|
||||||
|
|
||||||
|
cdef int set_morph(self, const int i, TokenC* tokens) except -1:
    """Fill in the lemma and morphology of ``tokens[i]`` from its tag.

    The (tag id, word id) pair is looked up in ``self._morph_cache``. On a
    miss, a new entry is allocated from ``self.mem``, populated with the
    tag's morphology and the result of ``self.lemmatize``, and stored so
    later occurrences of the same pair reuse it.
    """
    cdef TokenC* token = &tokens[i]
    # token.pos is used as an index into the tag table here.
    cdef const PosTag* tag = &self.tags[token.pos]
    cdef _CachedMorph* entry = <_CachedMorph*>self._morph_cache.get(tag.id, token.lex.sic)
    if entry is NULL:
        entry = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
        entry.lemma = self.lemmatize(tag.pos, token.lex)
        entry.morph = tag.morph
        self._morph_cache.set(tag.id, token.lex.sic, <void*>entry)
    token.lemma = entry.lemma
    token.morph = entry.morph
|
||||||
|
|
||||||
|
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
    """Return the string-store id of the lemma of ``lex`` under tag ``pos``.

    Only NOUN, VERB and ADJ are handed to ``self.lemmatizer``. For any other
    tag, when no lemmatizer is set, or when the lemmatizer yields no
    candidates, the word's own id (``lex.sic``) is returned unchanged.
    """
    cdef bytes py_string
    cdef bytes lemma_string
    cdef set lemma_strings
    if self.lemmatizer is None:
        return lex.sic
    # Test the tag before touching the string store: for every other tag the
    # looked-up string would never be used.
    if pos != NOUN and pos != VERB and pos != ADJ:
        return lex.sic
    py_string = self.strings[lex.sic]
    lemma_strings = self.lemmatizer(py_string, pos)
    if not lemma_strings:
        # NOTE(review): falls back to the surface form instead of raising
        # IndexError on an empty candidate set -- confirm this is the
        # desired policy.
        return lex.sic
    # Smallest candidate in sort order, i.e. the same pick as sorted(...)[0].
    lemma_string = min(lemma_strings)
    return self.strings.intern(lemma_string, len(lemma_string)).i
|
||||||
|
|
||||||
|
def load_morph_exceptions(self, dict exc):
    """Pre-populate the morphology cache with explicitly specified analyses.

    ``exc`` maps a tag name to a dict of ``{word form: props}``. For each
    form, the lemma is taken from ``props['L']`` (defaulting to the form
    itself) and ``props`` is passed to ``set_morph_from_dict`` to fill in
    the morphology. The entry is stored in ``self._morph_cache`` under the
    tag's position in ``self.tag_names`` and the form's string-store id.

    Looking up a tag name that is not in ``self.tag_names`` propagates the
    error raised by ``self.tag_names.index``.
    """
    cdef unicode pos_str
    cdef unicode form_str
    cdef unicode lemma_str
    cdef dict entries
    cdef dict props
    cdef id_t sic
    cdef int pos
    cdef _CachedMorph* cached
    for pos_str, entries in exc.items():
        pos = self.tag_names.index(pos_str)
        for form_str, props in entries.items():
            lemma_str = props.get('L', form_str)
            sic = self.strings[form_str]
            # Allocated from self.mem, like the entries created by set_morph.
            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
            cached.lemma = self.strings[lemma_str]
            set_morph_from_dict(&cached.morph, props)
            self._morph_cache.set(pos, sic, <void*>cached)
|
||||||
|
|
||||||
|
|
||||||
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
|
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
|
||||||
_fill_from_token(&context[P2_sic], &tokens[i-2])
|
_fill_from_token(&context[P2_sic], &tokens[i-2])
|
||||||
|
|
|
@ -1,21 +1,4 @@
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
from preshed.maps cimport PreshMapArray
|
|
||||||
|
|
||||||
from .structs cimport TokenC, Lexeme, Morphology, PosTag
|
from .structs cimport TokenC, Lexeme, Morphology, PosTag
|
||||||
from .strings cimport StringStore
|
|
||||||
from .typedefs cimport id_t, univ_tag_t
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphologizer:
|
|
||||||
cdef Pool mem
|
|
||||||
cdef StringStore strings
|
|
||||||
cdef object lemmatizer
|
|
||||||
cdef PosTag* tags
|
|
||||||
cdef readonly list tag_names
|
|
||||||
|
|
||||||
cdef PreshMapArray _cache
|
|
||||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
|
|
||||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1
|
|
||||||
|
|
||||||
|
|
||||||
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1
|
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1
|
||||||
|
|
|
@ -1,105 +1,5 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
# cython: embedsignature=True
|
# cython: embedsignature=True
|
||||||
from os import path
|
|
||||||
import json
|
|
||||||
|
|
||||||
from .typedefs cimport id_t, univ_tag_t
|
|
||||||
from .typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT
|
|
||||||
from .typedefs cimport VERB, X, PUNCT, EOL
|
|
||||||
from . import util
|
|
||||||
|
|
||||||
|
|
||||||
UNIV_TAGS = {
|
|
||||||
'NULL': NO_TAG,
|
|
||||||
'ADJ': ADJ,
|
|
||||||
'ADV': ADV,
|
|
||||||
'ADP': ADP,
|
|
||||||
'CONJ': CONJ,
|
|
||||||
'DET': DET,
|
|
||||||
'NOUN': NOUN,
|
|
||||||
'NUM': NUM,
|
|
||||||
'PRON': PRON,
|
|
||||||
'PRT': PRT,
|
|
||||||
'VERB': VERB,
|
|
||||||
'X': X,
|
|
||||||
'.': PUNCT,
|
|
||||||
'EOL': EOL
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct _Cached:
|
|
||||||
Morphology morph
|
|
||||||
int lemma
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphologizer:
|
|
||||||
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis.
|
|
||||||
"""
|
|
||||||
def __init__(self, StringStore strings, object tag_names, object tag_map,
|
|
||||||
object lemmatizer, irregulars=None):
|
|
||||||
self.mem = Pool()
|
|
||||||
self.strings = strings
|
|
||||||
self.lemmatizer = lemmatizer
|
|
||||||
cdef int n_tags = len(tag_names) + 1
|
|
||||||
self._cache = PreshMapArray(n_tags)
|
|
||||||
self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
|
|
||||||
for i, tag in enumerate(sorted(tag_names)):
|
|
||||||
pos, props = tag_map[tag]
|
|
||||||
self.tags[i].id = i
|
|
||||||
self.tags[i].pos = pos
|
|
||||||
self.tags[i].morph.number = props.get('number', 0)
|
|
||||||
self.tags[i].morph.tenspect = props.get('tenspect', 0)
|
|
||||||
self.tags[i].morph.mood = props.get('mood', 0)
|
|
||||||
self.tags[i].morph.gender = props.get('gender', 0)
|
|
||||||
self.tags[i].morph.person = props.get('person', 0)
|
|
||||||
self.tags[i].morph.case = props.get('case', 0)
|
|
||||||
self.tags[i].morph.misc = props.get('misc', 0)
|
|
||||||
if irregulars is not None:
|
|
||||||
self.load_exceptions(irregulars)
|
|
||||||
|
|
||||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
|
|
||||||
if self.lemmatizer is None:
|
|
||||||
return lex.sic
|
|
||||||
if pos != NOUN and pos != VERB and pos != ADJ:
|
|
||||||
return lex.sic
|
|
||||||
cdef bytes py_string = self.strings[lex.sic]
|
|
||||||
cdef set lemma_strings
|
|
||||||
cdef bytes lemma_string
|
|
||||||
lemma_strings = self.lemmatizer(py_string, pos)
|
|
||||||
lemma_string = sorted(lemma_strings)[0]
|
|
||||||
lemma = self.strings.intern(lemma_string, len(lemma_string)).i
|
|
||||||
return lemma
|
|
||||||
|
|
||||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1:
|
|
||||||
cdef const PosTag* tag = &self.tags[tokens[i].pos]
|
|
||||||
cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
|
|
||||||
if cached is NULL:
|
|
||||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
|
||||||
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
|
|
||||||
cached.morph = tag.morph
|
|
||||||
self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
|
|
||||||
tokens[i].lemma = cached.lemma
|
|
||||||
tokens[i].morph = cached.morph
|
|
||||||
|
|
||||||
def load_exceptions(self, dict exc):
|
|
||||||
cdef unicode pos_str
|
|
||||||
cdef unicode form_str
|
|
||||||
cdef unicode lemma_str
|
|
||||||
cdef dict entries
|
|
||||||
cdef dict props
|
|
||||||
cdef int lemma
|
|
||||||
cdef id_t sic
|
|
||||||
cdef int pos
|
|
||||||
for pos_str, entries in exc.items():
|
|
||||||
pos = self.strings.pos_tags[pos_str]
|
|
||||||
assert pos < len(self.strings.pos_tags)
|
|
||||||
for form_str, props in entries.items():
|
|
||||||
lemma_str = props.get('L', form_str)
|
|
||||||
sic = self.strings[form_str]
|
|
||||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
|
||||||
cached.lemma = self.strings[lemma_str]
|
|
||||||
set_morph_from_dict(&cached.morph, props)
|
|
||||||
self._cache.set(pos, sic, <void*>cached)
|
|
||||||
|
|
||||||
|
|
||||||
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
|
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
|
||||||
|
|
|
@ -10,11 +10,6 @@ def EN():
|
||||||
def tagged(EN):
|
def tagged(EN):
|
||||||
string = u'Bananas in pyjamas are geese.'
|
string = u'Bananas in pyjamas are geese.'
|
||||||
tokens = EN(string, tag=True)
|
tokens = EN(string, tag=True)
|
||||||
assert EN.tagger.tags[tokens[0].pos] == 'NNP'
|
|
||||||
assert EN.tagger.tags[tokens[1].pos] == 'IN'
|
|
||||||
assert EN.tagger.tags[tokens[2].pos] == 'NNS'
|
|
||||||
assert EN.tagger.tags[tokens[3].pos] == 'VBP'
|
|
||||||
assert EN.tagger.tags[tokens[3].pos] == 'NNS'
|
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ def test_read_exc():
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def lemmatizer():
|
def lemmatizer():
|
||||||
return Lemmatizer(path.join(DATA_DIR, 'wordnet'))
|
return Lemmatizer(path.join(DATA_DIR, 'wordnet'), 0, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
def test_noun_lemmas(lemmatizer):
|
def test_noun_lemmas(lemmatizer):
|
||||||
|
|
|
@ -18,8 +18,8 @@ def morph_exc():
|
||||||
}
|
}
|
||||||
|
|
||||||
def test_load_exc(EN, morph_exc):
|
def test_load_exc(EN, morph_exc):
|
||||||
EN.tagger.morphologizer.load_exceptions(morph_exc)
|
EN.tagger.load_morph_exceptions(morph_exc)
|
||||||
tokens = EN('I like his style.', tag=True)
|
tokens = EN('I like his style.', tag=True)
|
||||||
his = tokens[2]
|
his = tokens[2]
|
||||||
assert his.pos == 'PRP$'
|
assert EN.tagger.tag_names[his.pos] == 'PRP$'
|
||||||
assert his.lemma == '-PRP-'
|
assert his.lemma == '-PRP-'
|
||||||
|
|
|
@ -4,10 +4,10 @@ import pytest
|
||||||
from spacy.en import English
|
from spacy.en import English
|
||||||
|
|
||||||
def test_only_pre1():
|
def test_only_pre1():
|
||||||
EN = English()
|
EN = English(tag=False, parse=False)
|
||||||
assert len(EN("(")) == 1
|
assert len(EN("(")) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_only_pre2():
|
def test_only_pre2():
|
||||||
EN = English()
|
EN = English(tag=False, parse=False)
|
||||||
assert len(EN("((")) == 2
|
assert len(EN("((")) == 2
|
||||||
|
|
|
@ -12,7 +12,7 @@ def open_puncts():
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def EN():
|
def EN():
|
||||||
return English()
|
return English(tag=False, parse=False)
|
||||||
|
|
||||||
|
|
||||||
def test_open(open_puncts, EN):
|
def test_open(open_puncts, EN):
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy.en import EN
|
from spacy.en import English
|
||||||
from spacy.util import utf8open
|
from spacy.util import utf8open
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
@ -18,6 +18,7 @@ def sun_txt():
|
||||||
|
|
||||||
|
|
||||||
def test_tokenize(sun_txt):
|
def test_tokenize(sun_txt):
|
||||||
|
nlp = English(tag=False, parse=False)
|
||||||
assert len(sun_txt) != 0
|
assert len(sun_txt) != 0
|
||||||
tokens = EN.tokenize(sun_txt)
|
tokens = nlp(sun_txt)
|
||||||
assert True
|
assert True
|
||||||
|
|
Loading…
Reference in New Issue
Block a user