mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
* More work on language independent parsing
This commit is contained in:
parent
c2307fa9ee
commit
534e3dda3c
|
@ -6,7 +6,6 @@ except ImportError:
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from .tokenizer import Tokenizer
|
from .tokenizer import Tokenizer
|
||||||
from .morphology import Morphology
|
|
||||||
from .vocab import Vocab
|
from .vocab import Vocab
|
||||||
from .syntax.parser import Parser
|
from .syntax.parser import Parser
|
||||||
from .tagger import Tagger
|
from .tagger import Tagger
|
||||||
|
@ -132,16 +131,12 @@ class Language(object):
|
||||||
def default_data_dir(cls):
|
def default_data_dir(cls):
|
||||||
return path.join(path.dirname(__file__), 'data')
|
return path.join(path.dirname(__file__), 'data')
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def default_morphology(cls, data_dir):
|
|
||||||
return Morphology.from_dir(data_dir)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def default_vectors(cls, data_dir):
|
def default_vectors(cls, data_dir):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None, morphology=None):
|
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None):
|
||||||
if data_dir is None:
|
if data_dir is None:
|
||||||
data_dir = cls.default_data_dir()
|
data_dir = cls.default_data_dir()
|
||||||
if vectors is None:
|
if vectors is None:
|
||||||
|
|
|
@ -22,7 +22,7 @@ cdef struct MorphAnalysisC:
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
cdef readonly Pool mem
|
cdef readonly Pool mem
|
||||||
cdef readonly object strings
|
cdef readonly StringStore strings
|
||||||
cdef public object lemmatizer
|
cdef public object lemmatizer
|
||||||
cdef public object n_tags
|
cdef public object n_tags
|
||||||
cdef public object reverse_index
|
cdef public object reverse_index
|
||||||
|
|
|
@ -11,20 +11,15 @@ from .parts_of_speech cimport ADJ, VERB, NOUN
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
@classmethod
|
def __init__(self, StringStore string_store, tag_map, lemmatizer):
|
||||||
def from_dir(cls, data_dir, lemmatizer=None):
|
|
||||||
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
|
|
||||||
if lemmatizer is None:
|
|
||||||
lemmatizer = Lemmatizer.from_dir(data_dir)
|
|
||||||
return cls(tag_map, {}, lemmatizer)
|
|
||||||
|
|
||||||
def __init__(self, string_store, tag_map, lemmatizer):
|
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.strings = string_store
|
self.strings = string_store
|
||||||
self.lemmatizer = lemmatizer
|
self.lemmatizer = lemmatizer
|
||||||
self.n_tags = len(tag_map)
|
self.n_tags = len(tag_map) + 1
|
||||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||||
self.reverse_index = {}
|
self.reverse_index = {}
|
||||||
|
|
||||||
|
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
||||||
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
|
||||||
self.rich_tags[i].id = i
|
self.rich_tags[i].id = i
|
||||||
self.rich_tags[i].name = self.strings[tag_str]
|
self.rich_tags[i].name = self.strings[tag_str]
|
||||||
|
@ -33,12 +28,16 @@ cdef class Morphology:
|
||||||
self._cache = PreshMapArray(self.n_tags)
|
self._cache = PreshMapArray(self.n_tags)
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||||
cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag
|
cdef int tag_id
|
||||||
|
if isinstance(tag, basestring):
|
||||||
|
tag_id = self.reverse_index[self.strings[tag]]
|
||||||
|
else:
|
||||||
|
tag_id = tag
|
||||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||||
if analysis is NULL:
|
if analysis is NULL:
|
||||||
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||||
analysis.tag = self.rich_tags[tag_id]
|
analysis.tag = self.rich_tags[tag_id]
|
||||||
analysis.lemma = self.lemmatize(tag, token.lex.orth)
|
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
|
||||||
token.lemma = analysis.lemma
|
token.lemma = analysis.lemma
|
||||||
token.pos = analysis.tag.pos
|
token.pos = analysis.tag.pos
|
||||||
token.tag = analysis.tag.name
|
token.tag = analysis.tag.name
|
||||||
|
|
|
@ -104,7 +104,7 @@ cdef class Tagger:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def blank(cls, vocab, templates):
|
def blank(cls, vocab, templates):
|
||||||
model = Model(vocab.n_tags, templates, model_loc=None)
|
model = Model(vocab.morphology.n_tags, templates, model_loc=None)
|
||||||
return cls(vocab, model)
|
return cls(vocab, model)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -113,7 +113,7 @@ cdef class Tagger:
|
||||||
templates = json.loads(open(path.join(data_dir, 'templates.json')))
|
templates = json.loads(open(path.join(data_dir, 'templates.json')))
|
||||||
else:
|
else:
|
||||||
templates = cls.default_templates()
|
templates = cls.default_templates()
|
||||||
model = Model(vocab.n_tags, templates, data_dir)
|
model = Model(vocab.morphology.n_tags, templates, data_dir)
|
||||||
return cls(vocab, model)
|
return cls(vocab, model)
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, model):
|
def __init__(self, Vocab vocab, model):
|
||||||
|
@ -128,7 +128,7 @@ cdef class Tagger:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tag_names(self):
|
def tag_names(self):
|
||||||
return self.vocab.tag_names
|
return self.vocab.morphology.tag_names
|
||||||
|
|
||||||
def __call__(self, Doc tokens):
|
def __call__(self, Doc tokens):
|
||||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||||
|
|
|
@ -49,13 +49,15 @@ cdef class Vocab:
|
||||||
self._serializer = None
|
self._serializer = None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_dir(cls, data_dir, get_lex_attr=None, morphology=None, vectors=None):
|
def from_dir(cls, data_dir, get_lex_attr=None, vectors=None):
|
||||||
if not path.exists(data_dir):
|
if not path.exists(data_dir):
|
||||||
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
|
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
|
||||||
if not path.isdir(data_dir):
|
if not path.isdir(data_dir):
|
||||||
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
||||||
|
|
||||||
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
|
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
|
||||||
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
|
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
|
||||||
|
|
||||||
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
|
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
|
||||||
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
|
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
|
||||||
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user