commit ea8f1e7053
parent ea85bf3a0a

* Tighten interfaces
@@ -50,4 +50,4 @@ cdef class English(Language):
     pass


-EN = English('en', [], [])
+EN = English('en')
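
A quick sketch of the tightened entry point (only the constructor call is
confirmed by this hunk; where the data now comes from is inferred from the
Language.__init__ hunk further down):

    # The module-level instance no longer takes rule/lexeme list arguments:
    EN = English('en')              # was: English('en', [], [])
    # Rules, affix patterns and lexemes are located from the language name
    # under util.DATA_DIR instead of being supplied by the caller.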
@@ -27,7 +27,6 @@ cdef class Lexicon:
     cpdef readonly size_t size
     cpdef readonly StringStore strings

-    cpdef Lexeme lookup(self, unicode string)
     cdef Lexeme* get(self, String* s) except NULL

     cdef PreshMap _dict
@@ -1,11 +1,5 @@
 # cython: profile=True
 # cython: embedsignature=True
-"""Common classes and utilities across languages.
-
-Provides the main implementation for the spacy tokenizer. Specific languages
-subclass the Language class, over-writing the tokenization rules as necessary.
-Special-case tokenization rules are read from data/<lang>/tokenization .
-"""
 from __future__ import unicode_literals

 import json
@@ -24,27 +18,22 @@ from preshed.maps cimport PreshMap
 from .lexeme cimport Lexeme
 from .lexeme cimport init as lexeme_init

-from . import orth
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens


 cdef class Language:
-    """Base class for language-specific tokenizers.
-
-    The language's name is used to look up default data-files, found in data/<name.
-    """
     def __init__(self, name):
         self.name = name
         self.mem = Pool()
         self._cache = PreshMap(2 ** 25)
         self._specials = PreshMap(2 ** 16)
-        rules, prefix, suffix, infix, lexemes = util.read_lang_data(name)
+        rules, prefix, suffix, infix = util.read_lang_data(name)
         self._prefix_re = re.compile(prefix)
         self._suffix_re = re.compile(suffix)
         self._infix_re = re.compile(infix)
-        self.lexicon = Lexicon(lexemes)
+        self.lexicon = Lexicon()
         if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
             self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
             self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
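
In effect, lexeme data moves from a constructor argument to an on-disk load.
A sketch under the paths shown in this hunk (the `from spacy import util`
import path is an assumption for illustration):

    from os import path
    from spacy import util                  # assumed import path

    lexemes_loc = path.join(util.DATA_DIR, 'en', 'lexemes')
    strings_loc = path.join(util.DATA_DIR, 'en', 'strings')

    lexicon = Lexicon()                     # starts empty now
    if path.exists(lexemes_loc):
        lexicon.load(lexemes_loc)           # binary dump, written by dump()
        lexicon.strings.load(strings_loc)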
@@ -231,26 +220,11 @@ cdef class Language:


 cdef class Lexicon:
-    def __init__(self, lexemes):
+    def __init__(self):
         self.mem = Pool()
         self._dict = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.size = 1
-        cdef String string
-        cdef Lexeme* lexeme
-        for py_string, lexeme_dict in lexemes.iteritems():
-            string_from_unicode(&string, py_string)
-            lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
-            lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.strings,
-                                    lexeme_dict)
-            self._dict.set(string.key, lexeme)
-            self.size += 1
-
-    def set(self, unicode py_string, dict lexeme_dict):
-        cdef String string
-        string_from_unicode(&string, py_string)
-        cdef Lexeme* lex = self.get(&string)
-        lex[0] = lexeme_init(string.chars[:string.n], string.key, self.strings, lexeme_dict)

     cdef Lexeme* get(self, String* string) except NULL:
         cdef Lexeme* lex
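
With the batch-loading loop and set() removed, entries are created lazily:
get() allocates a Lexeme on the first miss (note the `self.size += 1` in its
body, visible in the next hunk). A minimal sketch of the resulting behaviour:

    lexicon = Lexicon()         # no lexeme dicts needed; size starts at 1
    lex = lexicon[u'hello']     # __getitem__ -> get(): allocates on first access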
@@ -263,20 +237,18 @@ cdef class Lexicon:
             self.size += 1
         return lex

-    cpdef Lexeme lookup(self, unicode uni_string):
-        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-
-        Args
-            string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-        Returns:
-            lexeme (Lexeme): A reference to a lexical type.
-        """
+    def __getitem__(self, unicode uni_string):
         cdef String string
         string_from_unicode(&string, uni_string)
         cdef Lexeme* lexeme = self.get(&string)
         return lexeme[0]

+    def __setitem__(self, unicode uni_string, dict props):
+        cdef String s
+        string_from_unicode(&s, uni_string)
+        cdef Lexeme* lex = self.get(&s)
+        lex[0] = lexeme_init(s.chars[:s.n], s.key, self.strings, props)
+
     def dump(self, loc):
         if path.exists(loc):
             assert not path.isdir(loc)
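
The cpdef lookup()/set() pair becomes Python's mapping protocol. Equivalent
calls, using a hypothetical property dict (the real lexeme keys are not shown
in this diff):

    props = {'prob': -7.5}          # hypothetical props, for illustration
    lexicon[u'hello'] = props       # was: lexicon.set(u'hello', props)
    lex = lexicon[u'hello']         # was: lexicon.lookup(u'hello')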
@@ -316,7 +288,6 @@ cdef class Lexicon:
                 break
             self._dict.set(key, lexeme)
             i += 1
-        print "Load %d lexemes" % i
         fclose(fp)


@@ -17,14 +17,7 @@ def read_lang_data(name):
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
-
-    lex_loc = path.join(data_dir, 'lexemes.json')
-    if path.exists(lex_loc):
-        with open(lex_loc) as file_:
-            lexemes = ujson.load(file_)
-    else:
-        lexemes = {}
-    return tokenization, prefix, suffix, infix, lexemes
+    return tokenization, prefix, suffix, infix


 def read_prefix(data_dir):
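
Callers now unpack a 4-tuple; lexeme data is no longer parsed from
lexemes.json here, but loaded by Lexicon.load() from the binary 'lexemes'
file, as the Language.__init__ hunk above shows:

    rules, prefix, suffix, infix = util.read_lang_data('en')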