* Use PointerMap from trustyc.maps instead of the locally provided _hashing module (PointerHash)

This commit is contained in:
Matthew Honnibal 2014-09-25 18:22:52 +02:00
parent ed446c67ad
commit b15619e170
4 changed files with 26 additions and 96 deletions

View File

@ -46,7 +46,6 @@ else:
exts = [ exts = [
Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes), Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes),
Extension("spacy.word", ["spacy/word.pyx"], language="c++", Extension("spacy.word", ["spacy/word.pyx"], language="c++",
include_dirs=includes), include_dirs=includes),
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++",

View File

@ -42,8 +42,6 @@ from libc.stdint cimport uint64_t
cimport lang cimport lang
from spacy.lexeme cimport lexeme_check_flag from spacy.lexeme cimport lexeme_check_flag
from spacy.lexeme cimport lexeme_string_view from spacy.lexeme cimport lexeme_string_view
from spacy._hashing cimport PointerHash
from spacy import orth from spacy import orth

View File

@ -3,7 +3,7 @@ from libc.stdint cimport uint64_t
from spacy.word cimport Lexeme from spacy.word cimport Lexeme
from spacy.tokens cimport Tokens from spacy.tokens cimport Tokens
from spacy.lexeme cimport LexemeC from spacy.lexeme cimport LexemeC
from spacy._hashing cimport PointerHash from trustyc.maps cimport PointerMap
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
@ -30,7 +30,7 @@ cdef class Lexicon:
cpdef Lexeme lookup(self, unicode string) cpdef Lexeme lookup(self, unicode string)
cdef LexemeC* get(self, String* s) except NULL cdef LexemeC* get(self, String* s) except NULL
cdef PointerHash _dict cdef PointerMap _dict
cdef list _string_features cdef list _string_features
cdef list _flag_features cdef list _flag_features
@ -39,10 +39,13 @@ cdef class Lexicon:
cdef class Language: cdef class Language:
cdef Pool _mem cdef Pool _mem
cdef unicode name cdef unicode name
cdef PointerHash cache cdef PointerMap cache
cdef PointerHash specials cdef PointerMap specials
cpdef readonly Lexicon lexicon cpdef readonly Lexicon lexicon
cdef object prefix_re
cdef object suffix_re
cpdef Tokens tokenize(self, unicode text) cpdef Tokens tokenize(self, unicode text)
cpdef Lexeme lookup(self, unicode text) cpdef Lexeme lookup(self, unicode text)

View File

@ -11,6 +11,7 @@ from __future__ import unicode_literals
import json import json
import random import random
from os import path from os import path
import re
from .util import read_lang_data from .util import read_lang_data
from spacy.tokens import Tokens from spacy.tokens import Tokens
@ -25,7 +26,7 @@ from cython.operator cimport preincrement as preinc
from cython.operator cimport dereference as deref from cython.operator cimport dereference as deref
from spacy._hashing cimport PointerHash from trustyc.maps cimport PointerMap
from spacy import orth from spacy import orth
from spacy import util from spacy import util
@ -129,10 +130,12 @@ cdef class Language:
def __cinit__(self, name, user_string_features, user_flag_features): def __cinit__(self, name, user_string_features, user_flag_features):
self.name = name self.name = name
self._mem = Pool() self._mem = Pool()
self.cache = PointerHash(2 ** 25) self.cache = PointerMap(2 ** 25)
self.specials = PointerHash(2 ** 16) self.specials = PointerMap(2 ** 16)
lang_data = util.read_lang_data(name) lang_data = util.read_lang_data(name)
rules, words, probs, clusters, case_stats, tag_stats = lang_data rules, prefix, suffix, words, probs, clusters, case_stats, tag_stats = lang_data
self.prefix_re = re.compile(prefix)
self.suffix_re = re.compile(suffix)
self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats, self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
STRING_VIEW_FUNCS + user_string_features, STRING_VIEW_FUNCS + user_string_features,
FLAG_FUNCS + user_flag_features) FLAG_FUNCS + user_flag_features)
@ -302,93 +305,20 @@ cdef class Language:
self.cache.set(key, lexemes) self.cache.set(key, lexemes)
cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1: cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
cdef Py_UNICODE c0 = chars[0] cdef unicode string = chars[:length]
cdef Py_UNICODE c1 = chars[1] match = self.prefix_re.search(string)
if c0 == ",": if match is None:
return 1
elif c0 == '"':
return 1
elif c0 == "(":
return 1
elif c0 == "[":
return 1
elif c0 == "{":
return 1
elif c0 == "*":
return 1
elif c0 == "<":
return 1
elif c0 == "$":
return 1
elif c0 == "£":
return 1
elif c0 == "":
return 1
elif c0 == "\u201c":
return 1
elif c0 == "'":
return 1
elif c0 == "`":
if c1 == "`":
return 2
else:
return 1
else:
return 0 return 0
else:
return match.end() - match.start()
cdef int _find_suffix(self, Py_UNICODE* chars, size_t length): cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
cdef Py_UNICODE c0 = chars[length - 1] cdef unicode string = chars[:length]
cdef Py_UNICODE c1 = chars[length - 2] if length >= 2 else 0 match = self.suffix_re.search(string)
cdef Py_UNICODE c2 = chars[length - 3] if length >= 3 else 0 if match is None:
if c0 == ",":
return 1
elif c0 == '"':
return 1
elif c0 == ')':
return 1
elif c0 == ']':
return 1
elif c0 == '}':
return 1
elif c0 == '*':
return 1
elif c0 == '!':
return 1
elif c0 == '?':
return 1
elif c0 == '%':
return 1
elif c0 == '$':
return 1
elif c0 == '>':
return 1
elif c0 == ':':
return 1
elif c0 == "'":
return 1
elif c0 == u'\u201d':
return 1
elif c0 == "s":
if c1 == "'":
return 2
else:
return 0
elif c0 == "S":
if c1 == "'":
return 2
else:
return 0
elif c0 == ".":
if c1 == ".":
if c2 == ".":
return 3
else:
return 2
else:
return 1
else:
return 0 return 0
else:
return match.end() - match.start()
def _load_special_tokenization(self, token_rules): def _load_special_tokenization(self, token_rules):
'''Load special-case tokenization rules. '''Load special-case tokenization rules.
@ -422,7 +352,7 @@ cdef class Lexicon:
self._mem = Pool() self._mem = Pool()
self._flag_features = flag_features self._flag_features = flag_features
self._string_features = string_features self._string_features = string_features
self._dict = PointerHash(2 ** 20) self._dict = PointerMap(2 ** 20)
self.size = 0 self.size = 0
cdef String string cdef String string
for uni_string in words: for uni_string in words: