* More work on language-generic parsing

This commit is contained in:
Matthew Honnibal 2015-08-28 02:02:33 +02:00
parent 86c4a8e3e2
commit c2307fa9ee
12 changed files with 129 additions and 222 deletions

11
spacy/fi/__init__.py Normal file
View File

@ -0,0 +1,11 @@
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
class Finnish(Language):
    """Language subclass for Finnish."""

    @classmethod
    def default_data_dir(cls):
        """Return the path of the 'data' directory located beside this module."""
        package_dir = path.dirname(__file__)
        return path.join(package_dir, 'data')

View File

@ -148,13 +148,10 @@ class Language(object):
vectors = cls.default_vectors(data_dir)
if get_lex_attr is None:
get_lex_attr = cls.default_lex_attrs(data_dir)
if morphology is None:
morphology = cls.default_morphology(path.join(data_dir, 'vocab'))
return Vocab.from_dir(
path.join(data_dir, 'vocab'),
get_lex_attr=get_lex_attr,
vectors=vectors,
morphology=morphology)
vectors=vectors)
@classmethod
def default_tokenizer(cls, vocab, data_dir):

View File

@ -1,18 +1,41 @@
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMapArray
from libc.stdint cimport uint64_t
from .structs cimport TokenC
from .strings cimport StringStore
from .typedefs cimport attr_t
from .parts_of_speech cimport univ_pos_t
# Record describing one fine-grained tag. Morphology.__init__ fills one of
# these per entry of the tag map, in sorted tag-name order.
cdef struct RichTagC:
# Morphological feature value for the tag; set to 0 when the tag table is
# built and passed by pointer to Morphology.assign_feature.
# NOTE(review): presumably a bit-set of features -- confirm once
# assign_feature is implemented.
uint64_t morph
# Position of the tag in the sorted tag map.
int id
# Part-of-speech value; assign_tag copies it onto TokenC.pos.
univ_pos_t pos
# String-store ID of the tag's name; assign_tag copies it onto TokenC.tag.
attr_t name
# Analysis of one word form under one tag, as looked up in Morphology._cache
# by (tag id, orth): the tag's RichTagC record together with the lemma.
cdef struct MorphAnalysisC:
# Copy of the RichTagC record for the assigned tag.
RichTagC tag
# String-store ID of the lemma; assign_tag copies it onto TokenC.lemma.
attr_t lemma
cdef class Morphology:
cdef readonly Pool mem
cdef readonly object strings
cdef public object lemmatizer
cdef public object tag_map
cdef public object n_tags
cdef public object reverse_index
cdef public object tag_names
cdef public object tag_ids
cdef public int n_tags
cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1
cdef RichTagC* rich_tags
cdef PreshMapArray _cache
cdef int assign_tag(self, TokenC* token, tag) except -1
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
cdef int assign_from_dict(self, TokenC* token, props) except -1
#
#cpdef enum Feature_t:

View File

@ -6,13 +6,8 @@ try:
except ImportError:
import json
from spacy.parts_of_speech import UNIV_POS_NAMES
cdef struct MorphAnalysisC:
uint64_t[4] features
attr_t lemma
attr_t pos
from .parts_of_speech import UNIV_POS_NAMES
from .parts_of_speech cimport ADJ, VERB, NOUN
cdef class Morphology:
@ -23,32 +18,37 @@ cdef class Morphology:
lemmatizer = Lemmatizer.from_dir(data_dir)
return cls(tag_map, {}, lemmatizer)
def __init__(self, tag_map, fused_tokens, lemmatizer):
def __init__(self, string_store, tag_map, lemmatizer):
self.mem = Pool()
self.strings = string_store
self.lemmatizer = lemmatizer
self.tag_map = tag_map
self.n_tags = len(tag_map)
self.tag_names = tuple(sorted(tag_map.keys()))
self.tag_ids = {}
for i, tag_str in enumerate(self.tag_names):
self.tag_ids[tag_str] = i
self._cache = PreshMapArray()
self.reverse_index = {}
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str]
self.rich_tags[i].morph = 0
self.reverse_index[self.rich_tags[i].name] = i
self._cache = PreshMapArray(self.n_tags)
cdef int assign_tag(self, TokenC* token, tag) except -1:
analysis = <MorphAnalysisC*>self._cache.get(tag, token.lex.orth)
cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL:
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
cached = self.decode_tag(tag)
cached.lemma = self.lemmatize(token.pos, token.lex)
analysis.tag = self.rich_tags[tag_id]
analysis.lemma = self.lemmatize(tag, token.lex.orth)
token.lemma = analysis.lemma
token.pos = analysis.pos
token.tag = analysis.tag
token.morph = analysis.features
token.pos = analysis.tag.pos
token.tag = analysis.tag.name
token.morph = analysis.tag.morph
cdef int assign_feature(self, TokenC* token, feature, value) except -1:
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
pass
def load_morph_exceptions(self, dict exc):
# Map (form, pos) to (lemma, inflection)
# Map (form, pos) to (lemma, rich tag)
cdef unicode pos_str
cdef unicode form_str
cdef unicode lemma_str
@ -57,121 +57,30 @@ cdef class Morphology:
cdef int lemma
cdef attr_t orth
cdef int pos
for pos_str, entries in exc.items():
pos = self.tag_names.index(pos_str)
for tag_str, entries in exc.items():
tag = self.strings[tag_str]
rich_tag = self.rich_tags[self.reverse_index[tag]]
for form_str, props in entries.items():
lemma_str = props.get('L', form_str)
orth = self.strings[form_str]
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
cached.lemma = self.strings[lemma_str]
self.set_features(cached, props)
self._cache.set(pos, orth, <void*>cached)
orth = self.strings[form_str]
for name_str, value_str in props.items():
if name_str == 'L':
cached.lemma = self.strings[value_str]
else:
self.assign_feature(&cached.tag.morph, name_str, value_str)
if cached.lemma == 0:
cached.lemma = self.lemmatize(rich_tag.pos, orth)
self._cache.set(rich_tag.pos, orth, <void*>cached)
def _load_special_tokenization(self, special_cases):
'''Add a special-case tokenization rule.
'''
cdef int i
cdef list substrings
cdef unicode chunk
cdef unicode form
cdef unicode lemma
cdef dict props
cdef LexemeC** lexemes
cdef hash_t hashed
for chunk, substrings in sorted(special_cases.items()):
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
# Set the special tokens up to have morphology and lemmas if
# specified, otherwise use the part-of-speech tag (if specified)
form = props['F']
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
morphology = self.vocab.morphology.decode_dict(props)
tokens[i].lemma = morph_analysis.lemma
tokens[i].pos = morph_analysis.pos
tokens[i].tag = morph_analysis.tag
tokens[i].morph = morph_analysis.morph
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings)
cached.is_lex = False
cached.data.tokens = tokens
hashed = hash_string(chunk)
self._specials.set(hashed, cached)
self._cache.set(hashed, cached)
#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
# morph.number = props.get('number', 0)
# morph.tenspect = props.get('tenspect', 0)
# morph.mood = props.get('mood', 0)
# morph.gender = props.get('gender', 0)
# morph.person = props.get('person', 0)
# morph.case = props.get('case', 0)
# morph.misc = props.get('misc', 0)
#
#
#cdef class Morphology:
# cdef Pool mem
# cdef PreshMap table
#
# def __init__(self, tags, exceptions):
# pass
#
# def __getitem__(self, hash_t id_):
# pass
#
# cdef const InflectionC* get(self, hash_t key) except NULL:
# pass
#
# cdef MorphAnalysis analyse(const TokenC* token) except -1:
# cdef struct MorphAnalysis morphology
# tokens[i].pos = tag.pos
# cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
# if cached is NULL:
# cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
# cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
# cached.morph = tag.morph
# self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
# tokens[i].lemma = cached.lemma
# tokens[i].morph = cached.morph
#
# cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
# if self.lemmatizer is None:
# return lex.orth
# cdef unicode py_string = self.strings[lex.orth]
# if pos != NOUN and pos != VERB and pos != ADJ:
# return lex.orth
# cdef set lemma_strings
# cdef unicode lemma_string
# lemma_strings = self.lemmatizer(py_string, pos)
# lemma_string = sorted(lemma_strings)[0]
# lemma = self.strings[lemma_string]
# return lemma
#
#
#cdef class Inflection:
# cdef InflectionC* c
#
# def __init__(self, container, id_):
# self.c = container[id_]
# self.container = container
#
# for i, feat_id in enumerate(feat_ids):
# feature, value = parse_id(feat_id)
# self.add_value(feature, value, True)
#
# def has(self, Value_t feat_value_id):
# part = feat_value_id % 64
# bit = feat_value_id / 64
# if self.value_set[part] & bit:
# return True
# else:
# return False
#
# property pos: def __get__(self): return self.c.pos
#
# property id: def __get__(self): return self.c.id
#
# property features:
# pass
def lemmatize(self, const univ_pos_t pos, attr_t orth):
    """Return the string-store ID of the lemma for a word form.

    pos -- part-of-speech of the word; only NOUN, VERB and ADJ are passed
        to the lemmatizer.
    orth -- string-store ID of the word form.

    Returns `orth` unchanged when no lemmatizer is set, when `pos` is not
    one of NOUN/VERB/ADJ, or when the lemmatizer yields no candidates.
    Otherwise returns the ID of the alphabetically first candidate lemma.
    """
    cdef unicode py_string
    cdef set lemma_strings
    if self.lemmatizer is None:
        return orth
    # Check the POS before decoding the string: words that are not
    # lemmatized never need the string-store lookup.
    if pos != NOUN and pos != VERB and pos != ADJ:
        return orth
    py_string = self.strings[orth]
    lemma_strings = self.lemmatizer(py_string, pos)
    # An empty result used to raise IndexError from `sorted(...)[0]`; fall
    # back to the surface form instead.
    if not lemma_strings:
        return orth
    # min() picks the same candidate as sorted(...)[0] without a full sort.
    return self.strings[min(lemma_strings)]

View File

@ -25,17 +25,6 @@ cdef struct LexemeC:
float sentiment
float l2_norm
cdef struct MorphFeatC:
int name
int value
cdef struct MorphologyC:
uint64_t[4] feature_set
MorphFeatC* features
univ_pos_t pos
int n
cdef struct Entity:
int start
@ -54,8 +43,8 @@ cdef struct Constituent:
cdef struct TokenC:
const LexemeC* lex
const MorphologyC* morph
const Constituent* ctnt
uint64_t morph
univ_pos_t pos
bint spacy
int tag

View File

@ -104,7 +104,7 @@ cdef class Tagger:
@classmethod
def blank(cls, vocab, templates):
model = Model(vocab.morphology.n_tags, templates, model_loc=None)
model = Model(vocab.n_tags, templates, model_loc=None)
return cls(vocab, model)
@classmethod
@ -113,7 +113,7 @@ cdef class Tagger:
templates = json.loads(open(path.join(data_dir, 'templates.json')))
else:
templates = cls.default_templates()
model = Model(vocab.morphology.n_tags, templates, data_dir)
model = Model(vocab.n_tags, templates, data_dir)
return cls(vocab, model)
def __init__(self, Vocab vocab, model):
@ -128,7 +128,7 @@ cdef class Tagger:
@property
def tag_names(self):
return self.vocab.morphology.tag_names
return self.vocab.tag_names
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
@ -143,14 +143,15 @@ cdef class Tagger:
for i in range(tokens.length):
if tokens.data[i].pos == 0:
guess = self.predict(i, tokens.data)
self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
self.vocab.morphology.assign_tag(&tokens.data[i], guess)
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def tag_from_strings(self, Doc tokens, object tag_strs):
cdef int i
for i in range(tokens.length):
self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i])
self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
@ -168,7 +169,9 @@ cdef class Tagger:
for i in range(tokens.length):
guess = self.update(i, tokens.data, golds[i])
loss = golds[i] != -1 and guess != golds[i]
self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
self.vocab.morphology.assign_tag(&tokens.data[i], guess)
correct += loss == 0
self.freqs[TAG][tokens.data[i].tag] += 1
return correct

View File

@ -7,12 +7,7 @@ from .typedefs cimport hash_t
from .structs cimport LexemeC, TokenC
from .strings cimport StringStore
from .tokens.doc cimport Doc
from .vocab cimport Vocab, _Cached
cdef union LexemesOrTokens:
const LexemeC* const* lexemes
TokenC* tokens
from .vocab cimport Vocab, LexemesOrTokens, _Cached
cdef class Tokenizer:

View File

@ -192,9 +192,7 @@ cdef class Tokenizer:
tokens.push_back(prefixes[0][i], False)
if string:
cache_hit = self._try_cache(hash_string(string), tokens)
if cache_hit:
pass
else:
if not cache_hit:
match = self.find_infix(string)
if match is None:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
@ -253,38 +251,10 @@ cdef class Tokenizer:
cdef LexemeC** lexemes
cdef hash_t hashed
for chunk, substrings in sorted(special_cases.items()):
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
form = props['F']
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
lemma = props.get('L', form)
tokens[i].lemma = self.vocab.strings[lemma]
#TODO
#self.vocab.morphology.assign_from_dict(&tokens[i], props)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings)
cached.is_lex = False
cached.data.tokens = tokens
hashed = hash_string(chunk)
self._specials.set(hashed, cached)
self._cache.set(hashed, cached)
#if lemma is not None:
# tokens[i].lemma = self.vocab.strings[lemma]
#else:
# tokens[i].lemma = 0
#if 'pos' in props:
# inflection = self.vocab.morphology.get(props['pos'])
# inflection.assign(&tokens[i])
# # These are defaults, which can be over-ridden by the
# # token-specific props.
# #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
# #tokens[i].pos = pos
# ## These are defaults, which can be over-ridden by the
# ## token-specific props.
# #set_morph_from_dict(&tokens[i].morph, morph_features)
# #if tokens[i].lemma == 0:
# # tokens[i].lemma = tokens[i].lex.orth
##set_morph_from_dict(&tokens[i].morph, props)
cached.data.tokens = self.vocab.make_fused_token(substrings)
key = hash_string(chunk)
self._specials.set(key, cached)
self._cache.set(key, cached)

View File

@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
ctypedef const LexemeC* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr
ctypedef const TokenC* const_TokenC_ptr
ctypedef fused LexemeOrToken:
const_Lexeme_ptr
TokenC_ptr
const_TokenC_ptr
cdef class Doc:

View File

@ -209,7 +209,7 @@ cdef class Doc:
if self.length == self.max_length:
self._realloc(self.length * 2)
cdef TokenC* t = &self.data[self.length]
if LexemeOrToken is TokenC_ptr:
if LexemeOrToken is const_TokenC_ptr:
t[0] = lex_or_tok[0]
else:
t.lex = lex_or_tok

View File

@ -15,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME
cdef union LexemesOrTokens:
const LexemeC* const* lexemes
TokenC* tokens
const TokenC* tokens
cdef struct _Cached:
@ -37,6 +37,7 @@ cdef class Vocab:
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef const TokenC* make_fused_token(self, substrings) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1

View File

@ -17,6 +17,7 @@ from .strings cimport hash_string
from .orth cimport word_shape
from .typedefs cimport attr_t
from .cfile cimport CFile
from .lemmatizer import Lemmatizer
from cymem.cymem cimport Address
from . import util
@ -36,20 +37,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
@classmethod
def default_morphology(cls):
return Morphology({'VBZ': ['VERB', {}]}, [], None)
def __init__(self, get_lex_attr=None, morphology=None, vectors=None):
self.get_lex_attr = get_lex_attr
if morphology is None:
morphology = self.default_morphology()
self.morphology = morphology
def __init__(self, get_lex_attr=None, tag_map=None, vectors=None):
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore()
self.get_lex_attr = get_lex_attr
self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {}))
self.length = 1
self._serializer = None
@ -60,10 +54,9 @@ cdef class Vocab:
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors,
morphology=morphology)
self.load_lexemes(path.join(data_dir, 'strings.txt'),
path.join(data_dir, 'lexemes.bin'))
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
return self
@ -172,6 +165,22 @@ cdef class Vocab:
orth = id_or_string
return Lexeme(self, orth)
cdef const TokenC* make_fused_token(self, substrings) except NULL:
    """Build a TokenC array for a special-case (fused) tokenization.

    substrings -- sequence of dicts, one per sub-token. Each must have the
        key 'F' (the form); 'pos', 'L' (lemma) and 'morph' (a dict of
        feature -> value) are optional.

    Returns a pointer to an array allocated from self.mem that holds one
    more slot than there are substrings.
    """
    cdef int idx
    cdef TokenC* token
    cdef TokenC* fused = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
    for idx, spec in enumerate(substrings):
        token = &fused[idx]
        token.lex = <LexemeC*>self.get(self.mem, spec['F'])
        # Apply the tag first: an explicit lemma or feature given in the
        # spec is written afterwards, so it takes precedence.
        if 'pos' in spec:
            self.morphology.assign_tag(token, spec['pos'])
        if 'L' in spec:
            token.lemma = self.strings[spec['L']]
        extra_features = spec.get('morph', {})
        for feature, value in extra_features.items():
            self.morphology.assign_feature(&token.morph, feature, value)
    return fused
def dump(self, loc):
if path.exists(loc):
assert not path.isdir(loc)