mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
* More work on language independent parsing
This commit is contained in:
parent
c2307fa9ee
commit
534e3dda3c
|
@ -6,7 +6,6 @@ except ImportError:
|
|||
import json
|
||||
|
||||
from .tokenizer import Tokenizer
|
||||
from .morphology import Morphology
|
||||
from .vocab import Vocab
|
||||
from .syntax.parser import Parser
|
||||
from .tagger import Tagger
|
||||
|
@ -132,16 +131,12 @@ class Language(object):
|
|||
def default_data_dir(cls):
|
||||
return path.join(path.dirname(__file__), 'data')
|
||||
|
||||
@classmethod
|
||||
def default_morphology(cls, data_dir):
|
||||
return Morphology.from_dir(data_dir)
|
||||
|
||||
@classmethod
|
||||
def default_vectors(cls, data_dir):
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None, morphology=None):
|
||||
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None):
|
||||
if data_dir is None:
|
||||
data_dir = cls.default_data_dir()
|
||||
if vectors is None:
|
||||
|
|
|
@ -22,7 +22,7 @@ cdef struct MorphAnalysisC:
|
|||
|
||||
cdef class Morphology:
|
||||
cdef readonly Pool mem
|
||||
cdef readonly object strings
|
||||
cdef readonly StringStore strings
|
||||
cdef public object lemmatizer
|
||||
cdef public object n_tags
|
||||
cdef public object reverse_index
|
||||
|
|
|
@ -11,20 +11,15 @@ from .parts_of_speech cimport ADJ, VERB, NOUN
|
|||
|
||||
|
||||
cdef class Morphology:
|
||||
@classmethod
|
||||
def from_dir(cls, data_dir, lemmatizer=None):
|
||||
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
|
||||
if lemmatizer is None:
|
||||
lemmatizer = Lemmatizer.from_dir(data_dir)
|
||||
return cls(tag_map, {}, lemmatizer)
|
||||
|
||||
def __init__(self, string_store, tag_map, lemmatizer):
|
||||
def __init__(self, StringStore string_store, tag_map, lemmatizer):
|
||||
self.mem = Pool()
|
||||
self.strings = string_store
|
||||
self.lemmatizer = lemmatizer
|
||||
self.n_tags = len(tag_map)
|
||||
self.n_tags = len(tag_map) + 1
|
||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||
self.reverse_index = {}
|
||||
|
||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
||||
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
|
||||
self.rich_tags[i].id = i
|
||||
self.rich_tags[i].name = self.strings[tag_str]
|
||||
|
@ -33,12 +28,16 @@ cdef class Morphology:
|
|||
self._cache = PreshMapArray(self.n_tags)
|
||||
|
||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||
cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag
|
||||
cdef int tag_id
|
||||
if isinstance(tag, basestring):
|
||||
tag_id = self.reverse_index[self.strings[tag]]
|
||||
else:
|
||||
tag_id = tag
|
||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||
if analysis is NULL:
|
||||
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||
analysis.tag = self.rich_tags[tag_id]
|
||||
analysis.lemma = self.lemmatize(tag, token.lex.orth)
|
||||
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
|
||||
token.lemma = analysis.lemma
|
||||
token.pos = analysis.tag.pos
|
||||
token.tag = analysis.tag.name
|
||||
|
|
|
@ -104,7 +104,7 @@ cdef class Tagger:
|
|||
|
||||
@classmethod
|
||||
def blank(cls, vocab, templates):
|
||||
model = Model(vocab.n_tags, templates, model_loc=None)
|
||||
model = Model(vocab.morphology.n_tags, templates, model_loc=None)
|
||||
return cls(vocab, model)
|
||||
|
||||
@classmethod
|
||||
|
@ -113,7 +113,7 @@ cdef class Tagger:
|
|||
templates = json.loads(open(path.join(data_dir, 'templates.json')))
|
||||
else:
|
||||
templates = cls.default_templates()
|
||||
model = Model(vocab.n_tags, templates, data_dir)
|
||||
model = Model(vocab.morphology.n_tags, templates, data_dir)
|
||||
return cls(vocab, model)
|
||||
|
||||
def __init__(self, Vocab vocab, model):
|
||||
|
@ -128,7 +128,7 @@ cdef class Tagger:
|
|||
|
||||
@property
|
||||
def tag_names(self):
|
||||
return self.vocab.tag_names
|
||||
return self.vocab.morphology.tag_names
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||
|
|
|
@ -49,13 +49,15 @@ cdef class Vocab:
|
|||
self._serializer = None
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, data_dir, get_lex_attr=None, morphology=None, vectors=None):
|
||||
def from_dir(cls, data_dir, get_lex_attr=None, vectors=None):
|
||||
if not path.exists(data_dir):
|
||||
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
|
||||
if not path.isdir(data_dir):
|
||||
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
||||
|
||||
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
|
||||
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
|
||||
|
||||
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
|
||||
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
|
||||
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
||||
|
|
Loading…
Reference in New Issue
Block a user