mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Use PointerHash instead of locally provided _hashing module
This commit is contained in:
parent
ed446c67ad
commit
b15619e170
1
setup.py
1
setup.py
|
@ -46,7 +46,6 @@ else:
|
||||||
|
|
||||||
exts = [
|
exts = [
|
||||||
Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
|
Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
|
||||||
Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes),
|
|
||||||
Extension("spacy.word", ["spacy/word.pyx"], language="c++",
|
Extension("spacy.word", ["spacy/word.pyx"], language="c++",
|
||||||
include_dirs=includes),
|
include_dirs=includes),
|
||||||
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++",
|
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++",
|
||||||
|
|
|
@ -42,8 +42,6 @@ from libc.stdint cimport uint64_t
|
||||||
cimport lang
|
cimport lang
|
||||||
from spacy.lexeme cimport lexeme_check_flag
|
from spacy.lexeme cimport lexeme_check_flag
|
||||||
from spacy.lexeme cimport lexeme_string_view
|
from spacy.lexeme cimport lexeme_string_view
|
||||||
from spacy._hashing cimport PointerHash
|
|
||||||
|
|
||||||
|
|
||||||
from spacy import orth
|
from spacy import orth
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ from libc.stdint cimport uint64_t
|
||||||
from spacy.word cimport Lexeme
|
from spacy.word cimport Lexeme
|
||||||
from spacy.tokens cimport Tokens
|
from spacy.tokens cimport Tokens
|
||||||
from spacy.lexeme cimport LexemeC
|
from spacy.lexeme cimport LexemeC
|
||||||
from spacy._hashing cimport PointerHash
|
from trustyc.maps cimport PointerMap
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ cdef class Lexicon:
|
||||||
cpdef Lexeme lookup(self, unicode string)
|
cpdef Lexeme lookup(self, unicode string)
|
||||||
cdef LexemeC* get(self, String* s) except NULL
|
cdef LexemeC* get(self, String* s) except NULL
|
||||||
|
|
||||||
cdef PointerHash _dict
|
cdef PointerMap _dict
|
||||||
|
|
||||||
cdef list _string_features
|
cdef list _string_features
|
||||||
cdef list _flag_features
|
cdef list _flag_features
|
||||||
|
@ -39,10 +39,13 @@ cdef class Lexicon:
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
cdef Pool _mem
|
cdef Pool _mem
|
||||||
cdef unicode name
|
cdef unicode name
|
||||||
cdef PointerHash cache
|
cdef PointerMap cache
|
||||||
cdef PointerHash specials
|
cdef PointerMap specials
|
||||||
cpdef readonly Lexicon lexicon
|
cpdef readonly Lexicon lexicon
|
||||||
|
|
||||||
|
cdef object prefix_re
|
||||||
|
cdef object suffix_re
|
||||||
|
|
||||||
cpdef Tokens tokenize(self, unicode text)
|
cpdef Tokens tokenize(self, unicode text)
|
||||||
cpdef Lexeme lookup(self, unicode text)
|
cpdef Lexeme lookup(self, unicode text)
|
||||||
|
|
||||||
|
|
104
spacy/lang.pyx
104
spacy/lang.pyx
|
@ -11,6 +11,7 @@ from __future__ import unicode_literals
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
from os import path
|
from os import path
|
||||||
|
import re
|
||||||
|
|
||||||
from .util import read_lang_data
|
from .util import read_lang_data
|
||||||
from spacy.tokens import Tokens
|
from spacy.tokens import Tokens
|
||||||
|
@ -25,7 +26,7 @@ from cython.operator cimport preincrement as preinc
|
||||||
from cython.operator cimport dereference as deref
|
from cython.operator cimport dereference as deref
|
||||||
|
|
||||||
|
|
||||||
from spacy._hashing cimport PointerHash
|
from trustyc.maps cimport PointerMap
|
||||||
from spacy import orth
|
from spacy import orth
|
||||||
from spacy import util
|
from spacy import util
|
||||||
|
|
||||||
|
@ -129,10 +130,12 @@ cdef class Language:
|
||||||
def __cinit__(self, name, user_string_features, user_flag_features):
|
def __cinit__(self, name, user_string_features, user_flag_features):
|
||||||
self.name = name
|
self.name = name
|
||||||
self._mem = Pool()
|
self._mem = Pool()
|
||||||
self.cache = PointerHash(2 ** 25)
|
self.cache = PointerMap(2 ** 25)
|
||||||
self.specials = PointerHash(2 ** 16)
|
self.specials = PointerMap(2 ** 16)
|
||||||
lang_data = util.read_lang_data(name)
|
lang_data = util.read_lang_data(name)
|
||||||
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
rules, prefix, suffix, words, probs, clusters, case_stats, tag_stats = lang_data
|
||||||
|
self.prefix_re = re.compile(prefix)
|
||||||
|
self.suffix_re = re.compile(suffix)
|
||||||
self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
|
self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
|
||||||
STRING_VIEW_FUNCS + user_string_features,
|
STRING_VIEW_FUNCS + user_string_features,
|
||||||
FLAG_FUNCS + user_flag_features)
|
FLAG_FUNCS + user_flag_features)
|
||||||
|
@ -302,93 +305,20 @@ cdef class Language:
|
||||||
self.cache.set(key, lexemes)
|
self.cache.set(key, lexemes)
|
||||||
|
|
||||||
cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
|
cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
|
||||||
cdef Py_UNICODE c0 = chars[0]
|
cdef unicode string = chars[:length]
|
||||||
cdef Py_UNICODE c1 = chars[1]
|
match = self.prefix_re.search(string)
|
||||||
if c0 == ",":
|
if match is None:
|
||||||
return 1
|
|
||||||
elif c0 == '"':
|
|
||||||
return 1
|
|
||||||
elif c0 == "(":
|
|
||||||
return 1
|
|
||||||
elif c0 == "[":
|
|
||||||
return 1
|
|
||||||
elif c0 == "{":
|
|
||||||
return 1
|
|
||||||
elif c0 == "*":
|
|
||||||
return 1
|
|
||||||
elif c0 == "<":
|
|
||||||
return 1
|
|
||||||
elif c0 == "$":
|
|
||||||
return 1
|
|
||||||
elif c0 == "£":
|
|
||||||
return 1
|
|
||||||
elif c0 == "€":
|
|
||||||
return 1
|
|
||||||
elif c0 == "\u201c":
|
|
||||||
return 1
|
|
||||||
elif c0 == "'":
|
|
||||||
return 1
|
|
||||||
elif c0 == "`":
|
|
||||||
if c1 == "`":
|
|
||||||
return 2
|
|
||||||
else:
|
|
||||||
return 1
|
|
||||||
else:
|
|
||||||
return 0
|
return 0
|
||||||
|
else:
|
||||||
|
return match.end() - match.start()
|
||||||
|
|
||||||
cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
|
cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
|
||||||
cdef Py_UNICODE c0 = chars[length - 1]
|
cdef unicode string = chars[:length]
|
||||||
cdef Py_UNICODE c1 = chars[length - 2] if length >= 2 else 0
|
match = self.suffix_re.search(string)
|
||||||
cdef Py_UNICODE c2 = chars[length - 3] if length >= 3 else 0
|
if match is None:
|
||||||
|
|
||||||
if c0 == ",":
|
|
||||||
return 1
|
|
||||||
elif c0 == '"':
|
|
||||||
return 1
|
|
||||||
elif c0 == ')':
|
|
||||||
return 1
|
|
||||||
elif c0 == ']':
|
|
||||||
return 1
|
|
||||||
elif c0 == '}':
|
|
||||||
return 1
|
|
||||||
elif c0 == '*':
|
|
||||||
return 1
|
|
||||||
elif c0 == '!':
|
|
||||||
return 1
|
|
||||||
elif c0 == '?':
|
|
||||||
return 1
|
|
||||||
elif c0 == '%':
|
|
||||||
return 1
|
|
||||||
elif c0 == '$':
|
|
||||||
return 1
|
|
||||||
elif c0 == '>':
|
|
||||||
return 1
|
|
||||||
elif c0 == ':':
|
|
||||||
return 1
|
|
||||||
elif c0 == "'":
|
|
||||||
return 1
|
|
||||||
elif c0 == u'\u201d':
|
|
||||||
return 1
|
|
||||||
elif c0 == "s":
|
|
||||||
if c1 == "'":
|
|
||||||
return 2
|
|
||||||
else:
|
|
||||||
return 0
|
return 0
|
||||||
elif c0 == "S":
|
|
||||||
if c1 == "'":
|
|
||||||
return 2
|
|
||||||
else:
|
else:
|
||||||
return 0
|
return match.end() - match.start()
|
||||||
elif c0 == ".":
|
|
||||||
if c1 == ".":
|
|
||||||
if c2 == ".":
|
|
||||||
return 3
|
|
||||||
else:
|
|
||||||
return 2
|
|
||||||
else:
|
|
||||||
return 1
|
|
||||||
else:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
def _load_special_tokenization(self, token_rules):
|
def _load_special_tokenization(self, token_rules):
|
||||||
'''Load special-case tokenization rules.
|
'''Load special-case tokenization rules.
|
||||||
|
@ -422,7 +352,7 @@ cdef class Lexicon:
|
||||||
self._mem = Pool()
|
self._mem = Pool()
|
||||||
self._flag_features = flag_features
|
self._flag_features = flag_features
|
||||||
self._string_features = string_features
|
self._string_features = string_features
|
||||||
self._dict = PointerHash(2 ** 20)
|
self._dict = PointerMap(2 ** 20)
|
||||||
self.size = 0
|
self.size = 0
|
||||||
cdef String string
|
cdef String string
|
||||||
for uni_string in words:
|
for uni_string in words:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user