mirror of https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00

Update Lexicon class to expect a list of lexeme dict descriptions

parent 51d75b244b
commit e40caae51f
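The change replaces several parallel lexical resources (words, probs, clusters, case_stats, tag_stats) with a single lexemes.json holding a list of per-lexeme dicts. Only the 'string' key is confirmed by the diff below; the other keys in this sketch are guesses based on the fields the old loading loop computed:

# Hypothetical lexemes.json entry. Only 'string' is confirmed by the diff
# (see lexeme_dict['string'] in Lexicon.__cinit__); 'prob', 'cluster',
# 'views' and 'flags' are assumptions mirroring what the old loop computed
# and passed to lexeme_init.
example_lexeme = {
    "string": "the",
    "prob": -3.56,            # assumed: unigram log probability
    "cluster": 762,           # assumed: Brown cluster id
    "views": ["the", "Xxx"],  # assumed: precomputed string views
    "flags": [0, 3],          # assumed: indices of flag features that fired
}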
@@ -15,7 +15,7 @@ import re
 from .util import read_lang_data
 from spacy.tokens import Tokens
-from spacy.lexeme cimport LexemeC, lexeme_init
+from spacy.lexeme cimport LexemeC, lexeme_init, lexeme_pack, lexeme_unpack
 from murmurhash.mrmr cimport hash64

 from cpython.ref cimport Py_INCREF

@@ -25,7 +25,6 @@ from cymem.cymem cimport Pool
 from cython.operator cimport preincrement as preinc
-from cython.operator cimport dereference as deref


 from preshed.maps cimport PreshMap
 from spacy import orth
 from spacy import util

@@ -69,7 +68,6 @@ cdef enum Views:
     View_N


-
 # Assign the flag and view functions by enum value.
 # This is verbose, but it ensures we don't get nasty order sensitivities.
 STRING_VIEW_FUNCS = [None] * View_N

@@ -107,8 +105,6 @@ FLAG_FUNCS[Flag_OftTitle] = orth.oft_case('title', 0.7)
 FLAG_FUNCS[Flag_OftUpper] = orth.oft_case('upper', 0.7)


-
-
 cdef class Language:
     """Base class for language-specific tokenizers.

@@ -127,23 +123,19 @@ cdef class Language:
     fl_is_digit = Flag_IsDigit
     v_shape = View_WordShape

-    def __cinit__(self, name, user_string_features, user_flag_features):
+    def __init__(self, name, user_string_features, user_flag_features):
         self.name = name
         self._mem = Pool()
         self.cache = PreshMap(2 ** 25)
         self.specials = PreshMap(2 ** 16)
-        lang_data = util.read_lang_data(name)
-        rules, prefix, suffix, words, probs, clusters, case_stats, tag_stats = lang_data
+        rules, prefix, suffix, lexemes = util.read_lang_data(name)
         self.prefix_re = re.compile(prefix)
         self.suffix_re = re.compile(suffix)
-        self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
+        self.lexicon = Lexicon(lexemes,
                                STRING_VIEW_FUNCS + user_string_features,
                                FLAG_FUNCS + user_flag_features)
         self._load_special_tokenization(rules)

     def __dealloc__(self):
         pass

     property nr_types:
         def __get__(self):
             """Return the number of lexical types in the vocabulary"""

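The constructor's signature is unchanged by this hunk, so existing callers keep working; a hypothetical instantiation under the new code path:

# Hypothetical usage; the name 'en' and empty user feature lists are
# illustrative assumptions. read_lang_data and lexemes.json now supply
# the lexicon contents behind the scenes.
tokenizer = Language('en', user_string_features=[], user_flag_features=[])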
@@ -347,27 +339,20 @@ cdef class Language:


 cdef class Lexicon:
-    def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
-                  string_features, flag_features):
+    def __cinit__(self, lexemes, string_features, flag_features):
         self._mem = Pool()
         self._flag_features = flag_features
         self._string_features = string_features
         self._dict = PreshMap(2 ** 20)
         self.size = 0
         cdef String string
-        for uni_string in words:
-            prob = probs.get(uni_string, 0.0)
-            cluster = clusters.get(uni_string, 0.0)
-            cases = case_stats.get(uni_string, {})
-            tags = tag_stats.get(uni_string, {})
-            views = [string_view(uni_string, prob, cluster, cases, tags)
-                     for string_view in self._string_features]
-            flags = set()
-            for i, flag_feature in enumerate(self._flag_features):
-                if flag_feature(uni_string, prob, cluster, cases, tags):
-                    flags.add(i)
-            lexeme = lexeme_init(self._mem, self.size, uni_string, prob, cluster, views, flags)
-            string_from_unicode(&string, uni_string)
+        cdef dict lexeme_dict
+        cdef LexemeC* lexeme
+        for lexeme_dict in lexemes:
+            string_from_unicode(&string, lexeme_dict['string'])
+            lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
+            lexeme.views = <char**>self._mem.alloc(len(string_features), sizeof(char*))
+            lexeme_unpack(lexeme, lexeme_dict)
             self._dict.set(string.key, lexeme)
             self.size += 1

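A minimal pure-Python sketch of the new loading pattern, assuming the lexeme dict shape guessed near the top of this page. LexemeC, the Pool allocator and lexeme_unpack are Cython-level details; a plain class stands in for the C struct here:

class Lexeme:
    __slots__ = ("string", "prob", "cluster", "views", "flags")


def build_lexicon(lexemes):
    """Index precomputed lexeme dicts by string, as Lexicon.__cinit__ now does."""
    lexicon = {}
    for lexeme_dict in lexemes:
        lex = Lexeme()
        for field in Lexeme.__slots__:
            setattr(lex, field, lexeme_dict.get(field))  # stand-in for lexeme_unpack
        lexicon[lexeme_dict["string"]] = lex
    return lexicon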
@@ -22,3 +22,4 @@ cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)


 cdef dict lexeme_pack(LexemeC* lexeme)
+cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1

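A hypothetical round trip showing the contract the two declarations imply: lexeme_pack serializes a lexeme into a dict, lexeme_unpack restores it. The field names below are assumptions, not the real LexemeC layout:

def lexeme_pack(lex):
    return {"string": lex.string, "prob": lex.prob, "cluster": lex.cluster}


def lexeme_unpack(lex, p):
    lex.string = p["string"]
    lex.prob = p["prob"]
    lex.cluster = p["cluster"]
    return 0  # the Cython version returns int and signals errors via 'except -1'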
@@ -16,18 +16,16 @@ def read_lang_data(name):
     tokenization = read_tokenization(data_dir)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
-    words = load_resource(data_dir, 'words')
-    probs = load_resource(data_dir, 'probs')
-    clusters = load_resource(data_dir, 'clusters')
-    case_stats = load_resource(data_dir, 'case_stats')
-    tag_stats = load_resource(data_dir, 'tag_stats')
-    return tokenization, prefix, suffix, words, probs, clusters, case_stats, tag_stats
+    lex_loc = path.join(data_dir, 'lexemes.json')
+    if path.exists(lex_loc):
+        with open(lex_loc) as file_:
+            lexemes = ujson.load(file_)
+    else:
+        lexemes = []
+    return tokenization, prefix, suffix, lexemes


-def load_resource(data_dir, name):
-    loc = path.join(data_dir, name + '.json')
-    return json.load(loc) if path.exists(loc) else {}
-
-
 def read_prefix(data_dir):
     with utf8open(path.join(data_dir, 'prefix')) as file_:
         entries = file_.read().split('\n')

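A sketch of the writer side this change implies: dump a list of packed lexeme dicts to data_dir/lexemes.json so that read_lang_data can find it. The diff does not show the tool spaCy actually uses to build the file:

import ujson
from os import path


def write_lexemes(data_dir, lexeme_dicts):
    with open(path.join(data_dir, 'lexemes.json'), 'w') as file_:
        ujson.dump(lexeme_dicts, file_)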