mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
* Working refactor, with updated data model for Lexemes
This commit is contained in:
parent
3379d7a571
commit
5fddb8d165
28
spacy/en.pyx
28
spacy/en.pyx
|
@ -66,31 +66,3 @@ cpdef unicode unhash(StringHash hash_value):
|
||||||
return EN.unhash(hash_value)
|
return EN.unhash(hash_value)
|
||||||
|
|
||||||
|
|
||||||
cpdef bint is_oft_upper(size_t lex_id):
|
|
||||||
'''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
|
|
||||||
stores whether the lowered version of the string hashed by `lex' is found
|
|
||||||
in all-upper case frequently in a large sample of text. Users are free
|
|
||||||
to load different data, by default we use a sample from Wikipedia, with
|
|
||||||
a threshold of 0.95, picked to maximize mutual information for POS tagging.
|
|
||||||
|
|
||||||
>>> is_oft_upper(lookup(u'abc'))
|
|
||||||
True
|
|
||||||
>>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
|
|
||||||
True
|
|
||||||
'''
|
|
||||||
return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
|
|
||||||
|
|
||||||
|
|
||||||
cpdef bint is_oft_title(size_t lex_id):
|
|
||||||
'''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
|
|
||||||
stores whether the lowered version of the string hashed by `lex' is found
|
|
||||||
title-cased frequently in a large sample of text. Users are free
|
|
||||||
to load different data, by default we use a sample from Wikipedia, with
|
|
||||||
a threshold of 0.3, picked to maximize mutual information for POS tagging.
|
|
||||||
|
|
||||||
>>> is_oft_title(lookup(u'marcus'))
|
|
||||||
True
|
|
||||||
>>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
|
|
||||||
True
|
|
||||||
'''
|
|
||||||
return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
|
|
||||||
|
|
|
@ -32,12 +32,9 @@ cdef enum DistFlag:
|
||||||
|
|
||||||
|
|
||||||
cdef struct Orthography:
|
cdef struct Orthography:
|
||||||
StringHash last3
|
|
||||||
StringHash shape
|
StringHash shape
|
||||||
StringHash norm
|
StringHash norm
|
||||||
|
StringHash last3
|
||||||
size_t length
|
|
||||||
Py_UNICODE first
|
|
||||||
Bits8 flags
|
Bits8 flags
|
||||||
|
|
||||||
|
|
||||||
|
@ -49,12 +46,17 @@ cdef struct Distribution:
|
||||||
|
|
||||||
|
|
||||||
cdef struct Lexeme:
|
cdef struct Lexeme:
|
||||||
StringHash lex # Hash of the word
|
char* string
|
||||||
Orthography* orth # Extra orthographic views
|
size_t length
|
||||||
Distribution* dist # Distribution info
|
StringHash lex
|
||||||
|
Orthography orth # Extra orthographic views
|
||||||
|
Distribution dist # Distribution info
|
||||||
|
|
||||||
|
|
||||||
cdef Lexeme BLANK_WORD = Lexeme(0, NULL, NULL)
|
cdef Lexeme BLANK_WORD = Lexeme(NULL, 0, 0,
|
||||||
|
Orthography(0, 0, 0, 0),
|
||||||
|
Distribution(0.0, 0, 0, 0)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
cdef enum StringAttr:
|
cdef enum StringAttr:
|
||||||
|
@ -68,13 +70,11 @@ cdef enum StringAttr:
|
||||||
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
|
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
|
||||||
|
|
||||||
cpdef StringHash lex_of(size_t lex_id) except 0
|
cpdef StringHash lex_of(size_t lex_id) except 0
|
||||||
|
|
||||||
cpdef StringHash norm_of(size_t lex_id) except 0
|
cpdef StringHash norm_of(size_t lex_id) except 0
|
||||||
cpdef StringHash shape_of(size_t lex_id) except 0
|
cpdef StringHash shape_of(size_t lex_id) except 0
|
||||||
cpdef StringHash last3_of(size_t lex_id) except 0
|
cpdef StringHash last3_of(size_t lex_id) except 0
|
||||||
|
|
||||||
cpdef size_t length_of(size_t lex_id) except *
|
cpdef size_t length_of(size_t lex_id) except *
|
||||||
cpdef Py_UNICODE first_of(size_t lex_id) except *
|
|
||||||
|
|
||||||
cpdef double prob_of(size_t lex_id) except 0
|
cpdef double prob_of(size_t lex_id) except 0
|
||||||
cpdef ClusterID cluster_of(size_t lex_id) except 0
|
cpdef ClusterID cluster_of(size_t lex_id) except 0
|
||||||
|
|
|
@ -72,7 +72,7 @@ cpdef StringHash last3_of(size_t lex_id) except 0:
|
||||||
return (<Lexeme*>lex_id).orth.last3
|
return (<Lexeme*>lex_id).orth.last3
|
||||||
|
|
||||||
|
|
||||||
cpdef ClusterID cluster_of(size_t lex_id):
|
cpdef ClusterID cluster_of(size_t lex_id) except 0:
|
||||||
'''Access the `cluster' field of the Lexeme pointed to by lex_id, which
|
'''Access the `cluster' field of the Lexeme pointed to by lex_id, which
|
||||||
gives an integer representation of the cluster ID of the word,
|
gives an integer representation of the cluster ID of the word,
|
||||||
which should be understood as a binary address:
|
which should be understood as a binary address:
|
||||||
|
@ -99,21 +99,17 @@ cpdef Py_UNICODE first_of(size_t lex_id):
|
||||||
>>> unhash(first_of(lex_id))
|
>>> unhash(first_of(lex_id))
|
||||||
u'H'
|
u'H'
|
||||||
'''
|
'''
|
||||||
if (<Lexeme*>lex_id).orth == NULL:
|
|
||||||
return 0
|
|
||||||
return (<Lexeme*>lex_id).orth.first
|
return (<Lexeme*>lex_id).orth.first
|
||||||
|
|
||||||
|
|
||||||
cpdef StringHash length_of(size_t lex_id):
|
cpdef size_t length_of(size_t lex_id) except *:
|
||||||
'''Access the `length' field of the Lexeme pointed to by lex_id, which stores
|
'''Access the `length' field of the Lexeme pointed to by lex_id, which stores
|
||||||
the length of the string hashed by lex_of.'''
|
the length of the string hashed by lex_of.'''
|
||||||
cdef Lexeme* word = <Lexeme*>lex_id
|
cdef Lexeme* word = <Lexeme*>lex_id
|
||||||
if (<Lexeme*>lex_id).orth == NULL:
|
return word.length
|
||||||
return 0
|
|
||||||
return (<Lexeme*>lex_id).orth.length
|
|
||||||
|
|
||||||
|
|
||||||
cpdef double prob_of(size_t lex_id):
|
cpdef double prob_of(size_t lex_id) except 0:
|
||||||
'''Access the `prob' field of the Lexeme pointed to by lex_id, which stores
|
'''Access the `prob' field of the Lexeme pointed to by lex_id, which stores
|
||||||
the smoothed unigram log probability of the word, as estimated from a large
|
the smoothed unigram log probability of the word, as estimated from a large
|
||||||
text corpus. By default, probabilities are based on counts from Gigaword,
|
text corpus. By default, probabilities are based on counts from Gigaword,
|
||||||
|
@ -126,9 +122,38 @@ cpdef double prob_of(size_t lex_id):
|
||||||
return (<Lexeme*>lex_id).dist.prob
|
return (<Lexeme*>lex_id).dist.prob
|
||||||
|
|
||||||
|
|
||||||
cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *:
|
cpdef bint is_oft_upper(size_t lex_id):
|
||||||
|
'''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
|
||||||
|
stores whether the lowered version of the string hashed by `lex' is found
|
||||||
|
in all-upper case frequently in a large sample of text. Users are free
|
||||||
|
to load different data, by default we use a sample from Wikipedia, with
|
||||||
|
a threshold of 0.95, picked to maximize mutual information for POS tagging.
|
||||||
|
|
||||||
|
>>> is_oft_upper(lookup(u'abc'))
|
||||||
|
True
|
||||||
|
>>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
|
||||||
|
True
|
||||||
|
'''
|
||||||
|
return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
|
||||||
|
|
||||||
|
|
||||||
|
cpdef bint is_oft_title(size_t lex_id):
|
||||||
|
'''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
|
||||||
|
stores whether the lowered version of the string hashed by `lex' is found
|
||||||
|
title-cased frequently in a large sample of text. Users are free
|
||||||
|
to load different data, by default we use a sample from Wikipedia, with
|
||||||
|
a threshold of 0.3, picked to maximize mutual information for POS tagging.
|
||||||
|
|
||||||
|
>>> is_oft_title(lookup(u'marcus'))
|
||||||
|
True
|
||||||
|
>>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
|
||||||
|
True
|
||||||
|
'''
|
||||||
|
return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
|
||||||
|
|
||||||
|
cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
|
||||||
return (<Lexeme*>lex_id).orth.flags & (1 << flag)
|
return (<Lexeme*>lex_id).orth.flags & (1 << flag)
|
||||||
|
|
||||||
|
|
||||||
cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *:
|
cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
|
||||||
return (<Lexeme*>lex_id).dist.flags & (1 << flag)
|
return (<Lexeme*>lex_id).dist.flags & (1 << flag)
|
||||||
|
|
|
@ -21,7 +21,6 @@ ctypedef int ClusterID
|
||||||
from spacy.lexeme cimport Lexeme
|
from spacy.lexeme cimport Lexeme
|
||||||
from spacy.lexeme cimport Distribution
|
from spacy.lexeme cimport Distribution
|
||||||
from spacy.lexeme cimport Orthography
|
from spacy.lexeme cimport Orthography
|
||||||
from spacy._hashing cimport WordTree
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
|
@ -37,8 +36,6 @@ cdef class Language:
|
||||||
|
|
||||||
cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
|
cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
|
||||||
cdef Lexeme* new_lexeme(self, unicode lex) except NULL
|
cdef Lexeme* new_lexeme(self, unicode lex) except NULL
|
||||||
cdef Orthography* new_orth(self, unicode lex) except NULL
|
|
||||||
cdef Distribution* new_dist(self, unicode lex) except NULL
|
|
||||||
|
|
||||||
cdef unicode unhash(self, StringHash hashed)
|
cdef unicode unhash(self, StringHash hashed)
|
||||||
|
|
||||||
|
|
|
@ -13,15 +13,19 @@ from spacy.string_tools cimport substr
|
||||||
from . import util
|
from . import util
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
|
DIST_FLAGS = {}
|
||||||
|
TAGS = {}
|
||||||
|
|
||||||
def get_normalized(unicode lex, size_t length):
|
|
||||||
|
def get_normalized(unicode lex):
|
||||||
if lex.isalpha() and lex.islower():
|
if lex.isalpha() and lex.islower():
|
||||||
return lex
|
return lex
|
||||||
else:
|
else:
|
||||||
return get_word_shape(lex, length)
|
return get_word_shape(lex)
|
||||||
|
|
||||||
|
|
||||||
def get_word_shape(unicode lex, length):
|
def get_word_shape(unicode lex):
|
||||||
|
cdef size_t length = len(lex)
|
||||||
shape = ""
|
shape = ""
|
||||||
last = ""
|
last = ""
|
||||||
shape_char = ""
|
shape_char = ""
|
||||||
|
@ -47,7 +51,7 @@ def get_word_shape(unicode lex, length):
|
||||||
return shape
|
return shape
|
||||||
|
|
||||||
|
|
||||||
def set_orth_flags(lex, length):
|
def set_orth_flags(lex):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
@ -60,7 +64,7 @@ cdef class Language:
|
||||||
self.chunks.set_empty_key(0)
|
self.chunks.set_empty_key(0)
|
||||||
self.vocab.set_empty_key(0)
|
self.vocab.set_empty_key(0)
|
||||||
self.load_tokenization(util.read_tokenization(name))
|
self.load_tokenization(util.read_tokenization(name))
|
||||||
self.load_dist_info(util.read_dist_info(name))
|
#self.load_dist_info(util.read_dist_info(name))
|
||||||
|
|
||||||
cdef Tokens tokenize(self, unicode string):
|
cdef Tokens tokenize(self, unicode string):
|
||||||
cdef Lexeme** chunk
|
cdef Lexeme** chunk
|
||||||
|
@ -106,39 +110,25 @@ cdef class Language:
|
||||||
|
|
||||||
cdef Lexeme* new_lexeme(self, unicode string) except NULL:
|
cdef Lexeme* new_lexeme(self, unicode string) except NULL:
|
||||||
cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
|
cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
|
||||||
|
cdef bytes byte_string = string.encode('utf8')
|
||||||
|
word.string = <char*>byte_string
|
||||||
|
word.length = len(byte_string)
|
||||||
|
word.orth.flags = set_orth_flags(string)
|
||||||
|
cdef unicode norm = get_normalized(string)
|
||||||
|
cdef unicode shape = get_word_shape(string)
|
||||||
|
cdef unicode last3 = string[-3:]
|
||||||
word.lex = hash(string)
|
word.lex = hash(string)
|
||||||
|
word.orth.norm = hash(norm)
|
||||||
|
word.orth.shape = hash(shape)
|
||||||
|
word.orth.last3 = hash(last3)
|
||||||
self.bacov[word.lex] = string
|
self.bacov[word.lex] = string
|
||||||
word.orth = self.new_orth(string)
|
self.bacov[word.orth.norm] = norm
|
||||||
|
self.bacov[word.orth.shape] = shape
|
||||||
|
self.bacov[word.orth.last3] = last3
|
||||||
|
|
||||||
word.dist = <Distribution*>calloc(1, sizeof(Distribution))
|
self.vocab[hash(string)] = <size_t>word
|
||||||
self.vocab[word.lex] = <size_t>word
|
|
||||||
return word
|
return word
|
||||||
|
|
||||||
cdef Orthography* new_orth(self, unicode lex) except NULL:
|
|
||||||
cdef unicode last3
|
|
||||||
cdef unicode norm
|
|
||||||
cdef unicode shape
|
|
||||||
cdef int length
|
|
||||||
|
|
||||||
length = len(lex)
|
|
||||||
orth = <Orthography*>calloc(1, sizeof(Orthography))
|
|
||||||
orth.first = lex[0]
|
|
||||||
|
|
||||||
orth.length = length
|
|
||||||
orth.flags = set_orth_flags(lex, orth.length)
|
|
||||||
orth.norm = hash(lex)
|
|
||||||
last3 = substr(lex, length - 3, length, length)
|
|
||||||
orth.last3 = hash(last3)
|
|
||||||
norm = get_normalized(lex, length)
|
|
||||||
orth.norm = hash(norm)
|
|
||||||
shape = get_word_shape(lex, length)
|
|
||||||
orth.shape = hash(shape)
|
|
||||||
|
|
||||||
self.bacov[orth.last3] = last3
|
|
||||||
self.bacov[orth.norm] = norm
|
|
||||||
self.bacov[orth.shape] = shape
|
|
||||||
return orth
|
|
||||||
|
|
||||||
cdef unicode unhash(self, StringHash hash_value):
|
cdef unicode unhash(self, StringHash hash_value):
|
||||||
'''Fetch a string from the reverse index, given its hash value.'''
|
'''Fetch a string from the reverse index, given its hash value.'''
|
||||||
return self.bacov[hash_value]
|
return self.bacov[hash_value]
|
||||||
|
@ -167,12 +157,12 @@ cdef class Language:
|
||||||
cdef Lexeme* w
|
cdef Lexeme* w
|
||||||
for string, word_dist in dist_info.items():
|
for string, word_dist in dist_info.items():
|
||||||
w = self.lookup(string)
|
w = self.lookup(string)
|
||||||
w.prob = word_dist.prob
|
w.dist.prob = word_dist.prob
|
||||||
w.cluster = word_dist.cluster
|
w.dist.cluster = word_dist.cluster
|
||||||
for flag in word_dist.flags:
|
for flag in word_dist.flags:
|
||||||
w.flags |= lexeme.DIST_FLAGS[flag]
|
w.dist.flags |= DIST_FLAGS[flag]
|
||||||
for tag in word_dist.tagdict:
|
for tag in word_dist.tagdict:
|
||||||
w.tagdict |= lexeme.TAGS[tag]
|
w.dist.tagdict |= TAGS[tag]
|
||||||
|
|
||||||
|
|
||||||
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
|
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user