mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
* Restore happax. Commit uncommitted work
This commit is contained in:
parent
6319ff0f22
commit
365a2af756
1
setup.py
1
setup.py
|
@ -48,6 +48,7 @@ exts = [
|
|||
Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
|
||||
include_dirs=includes),
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from .lexeme import lex_of
|
||||
from .lexeme import sic_of
|
||||
from .lexeme import length_of
|
||||
|
||||
from .tokens import Tokens
|
||||
|
||||
|
@ -10,28 +11,6 @@ LEX = 1
|
|||
NORM = 2
|
||||
SHAPE = 3
|
||||
LAST3 = 4
|
||||
LENGTH = 5
|
||||
|
||||
__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]
|
||||
|
||||
|
||||
"""
|
||||
from .tokens import ids_from_string
|
||||
from .tokens import group_by
|
||||
|
||||
from .lex import sic_of
|
||||
from .lex import lex_of
|
||||
from .lex import normed_of
|
||||
from .lex import first_of
|
||||
from .lex import last_three_of
|
||||
|
||||
from .lex import cluster_of
|
||||
from .lex import prob_of
|
||||
|
||||
from .lex import is_oft_upper
|
||||
from .lex import is_oft_title
|
||||
|
||||
from .lex import can_noun
|
||||
from .lex import can_verb
|
||||
from .lex import can_adj
|
||||
from .lex import can_adv
|
||||
"""
|
||||
__all__ = [Tokens, lex_of, sic_of, length_of, SIC, LEX, NORM, SHAPE, LAST3, LENGTH]
|
||||
|
|
|
@ -51,5 +51,3 @@ cdef class FixedTable:
|
|||
@cython.cdivision
|
||||
cdef inline size_t _find(uint64_t key, size_t size) nogil:
|
||||
return key % size
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ from libcpp.vector cimport vector
|
|||
from libc.stdint cimport uint64_t
|
||||
|
||||
from sparsehash.dense_hash_map cimport dense_hash_map
|
||||
from _hashing cimport FixedTable
|
||||
|
||||
# Circular import problems here
|
||||
ctypedef size_t Lexeme_addr
|
||||
|
@ -24,6 +25,7 @@ from spacy.lexeme cimport Orthography
|
|||
|
||||
cdef class Language:
|
||||
cdef object name
|
||||
cdef FixedTable happax
|
||||
cdef Vocab* vocab
|
||||
cdef Vocab* distri
|
||||
cdef Vocab* ortho
|
||||
|
@ -39,3 +41,5 @@ cdef class Language:
|
|||
cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
|
||||
int split, size_t length)
|
||||
cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
|
||||
|
||||
cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr addr)
|
||||
|
|
|
@ -50,15 +50,18 @@ def get_word_shape(lex, length):
|
|||
return shape
|
||||
|
||||
|
||||
|
||||
def set_orth_flags(lex, length):
|
||||
return 0
|
||||
|
||||
|
||||
DEF MAX_HAPPAX = 1048576
|
||||
|
||||
|
||||
cdef class Language:
|
||||
def __cinit__(self, name):
|
||||
self.name = name
|
||||
self.bacov = {}
|
||||
self.happax = FixedTable(MAX_HAPPAX)
|
||||
self.vocab = new Vocab()
|
||||
self.ortho = new Vocab()
|
||||
self.distri = new Vocab()
|
||||
|
@ -81,6 +84,7 @@ cdef class Language:
|
|||
length = len(token_string)
|
||||
hashed = self.hash_string(token_string, length)
|
||||
word.tail = self._add(hashed, lex, 0, len(lex))
|
||||
self._happax_to_vocab(hashed, <Lexeme_addr>word.tail)
|
||||
word = word.tail
|
||||
|
||||
def load_clusters(self):
|
||||
|
@ -122,14 +126,27 @@ cdef class Language:
|
|||
# First, check words seen 2+ times
|
||||
cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
|
||||
if word_ptr == NULL:
|
||||
start = self.find_split(string, length) if start == -1 else start
|
||||
word_ptr = self._add(hashed, string, start, length)
|
||||
# Now check words seen exactly once
|
||||
word_ptr = <Lexeme*>self.happax.get(hashed)
|
||||
if word_ptr == NULL:
|
||||
start = self.find_split(string, length) if start == -1 else start
|
||||
word_ptr = self._add(hashed, string, start, length)
|
||||
else:
|
||||
# Second time word seen, move to vocab
|
||||
self._happax_to_vocab(hashed, <Lexeme_addr>word_ptr)
|
||||
return <Lexeme_addr>word_ptr
|
||||
|
||||
cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr word_ptr):
|
||||
self.vocab[0][hashed] = word_ptr
|
||||
self.happax.erase(hashed)
|
||||
|
||||
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
|
||||
cdef size_t i
|
||||
word = self.init_lexeme(string, hashed, split, length)
|
||||
self.vocab[0][hashed] = <Lexeme_addr>word
|
||||
if self.happax.keys[hashed % self.happax.size] != 0:
|
||||
self._happax_to_vocab(self.happax.keys[hashed % self.happax.size],
|
||||
self.happax.values[hashed % self.happax.size])
|
||||
self.happax.insert(hashed, <size_t>word)
|
||||
self.bacov[hashed] = string
|
||||
return word
|
||||
|
||||
|
@ -194,6 +211,7 @@ cdef class Language:
|
|||
# Now recurse, and deal with the tail
|
||||
if tail_string:
|
||||
word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
|
||||
self._happax_to_vocab(word.tail.sic, <Lexeme_addr>word.tail)
|
||||
return word
|
||||
|
||||
cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
|
||||
|
|
|
@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
|
|||
|
||||
|
||||
from spacy.lexeme cimport Lexeme
|
||||
from spacy.lexeme cimport attr_of, norm_of, shape_of
|
||||
from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
|
||||
from spacy.spacy cimport StringHash
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user