mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 01:34:30 +03:00
* Refactor morphology.pyx
This commit is contained in:
parent
4c6ce7ee84
commit
4e30195c6d
|
@ -1,36 +1,9 @@
|
|||
|
||||
from .tokens cimport TokenC
|
||||
from .lexeme cimport Lexeme
|
||||
from .utf8string cimport StringStore
|
||||
from .typedefs cimport id_t, Morphology
|
||||
|
||||
from preshed.maps cimport PreshMapArray
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMapArray
|
||||
|
||||
|
||||
# Google universal tag set
# Coarse-grained part-of-speech categories; PosTag.pos stores one of these.
# No member is given an explicit value, so they number consecutively from
# NO_TAG == 0 in declaration order -- do not reorder or insert in the middle.
# N_UNIV_TAGS is declared last, so its value equals the number of members
# before it (presumably used as the tag count -- confirm at the use sites).
|
||||
cpdef enum univ_tag_t:
|
||||
NO_TAG
|
||||
ADJ
|
||||
ADV
|
||||
ADP
|
||||
CONJ
|
||||
DET
|
||||
NOUN
|
||||
NUM
|
||||
PRON
|
||||
PRT
|
||||
VERB
|
||||
X
|
||||
PUNCT
|
||||
EOL
|
||||
N_UNIV_TAGS
|
||||
|
||||
|
||||
# Per-tag record. Morphologizer.__init__ allocates one PosTag per entry of
# tag_names and fills in the morph fields (e.g. person, case, misc).
#   morph -- morphological features attached to the tag
#   id    -- integer identifier (presumably the tag's id/index -- confirm
#            where it is assigned)
#   pos   -- coarse universal tag (univ_tag_t) for the tag
cdef struct PosTag:
|
||||
Morphology morph
|
||||
int id
|
||||
univ_tag_t pos
|
||||
from .structs cimport TokenC, Lexeme, Morphology, PosTag
|
||||
from .strings cimport StringStore
|
||||
from .typedefs cimport id_t, univ_tag_t
|
||||
|
||||
|
||||
cdef class Morphologizer:
|
||||
|
|
|
@ -4,7 +4,9 @@ from os import path
|
|||
import json
|
||||
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .typedefs cimport id_t
|
||||
from .typedefs cimport id_t, univ_tag_t
|
||||
from .typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT
|
||||
from .typedefs cimport VERB, X, PUNCT, EOL
|
||||
from . import util
|
||||
|
||||
|
||||
|
@ -34,13 +36,12 @@ cdef struct _Cached:
|
|||
cdef class Morphologizer:
|
||||
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis.
|
||||
"""
|
||||
def __init__(self, StringStore strings, data_dir):
|
||||
def __init__(self, StringStore strings, object lemmatizer, **kwargs):
|
||||
self.mem = Pool()
|
||||
self.strings = strings
|
||||
cfg = json.load(open(path.join(data_dir, 'config.json')))
|
||||
tag_map = cfg['tag_map']
|
||||
self.tag_names = cfg['tag_names']
|
||||
self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
|
||||
tag_map = kwargs['tag_map']
|
||||
self.tag_names = kwargs['tag_names']
|
||||
self.lemmatizer = lemmatizer
|
||||
self._cache = PreshMapArray(len(self.tag_names))
|
||||
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
|
||||
for i, tag in enumerate(self.tag_names):
|
||||
|
@ -54,9 +55,9 @@ cdef class Morphologizer:
|
|||
self.tags[i].morph.person = props.get('person', 0)
|
||||
self.tags[i].morph.case = props.get('case', 0)
|
||||
self.tags[i].morph.misc = props.get('misc', 0)
|
||||
if path.exists(path.join(data_dir, 'morphs.json')):
|
||||
with open(path.join(data_dir, 'morphs.json')) as file_:
|
||||
self.load_exceptions(json.load(file_))
|
||||
#if path.exists(path.join(data_dir, 'morphs.json')):
|
||||
# with open(path.join(data_dir, 'morphs.json')) as file_:
|
||||
# self.load_exceptions(json.load(file_))
|
||||
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
|
||||
if self.lemmatizer is None:
|
||||
|
|
Loading…
Reference in New Issue
Block a user