mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
* Reforming data model for lexemes
This commit is contained in:
parent
e091b6a241
commit
3379d7a571
30
spacy/en.pyx
30
spacy/en.pyx
|
@ -64,3 +64,33 @@ cpdef Lexeme_addr lookup(unicode string) except 0:
|
|||
|
||||
cpdef unicode unhash(StringHash hash_value):
    '''Return the string that the English vocabulary (EN) stores for
    hash_value, by delegating to EN.unhash.'''
    cdef unicode original = EN.unhash(hash_value)
    return original
|
||||
|
||||
|
||||
cpdef bint is_oft_upper(size_t lex_id):
    '''Check the OFT_UPPER distribution flag of the Lexeme pointed to by
    lex_id.  The flag stores whether the lowered version of the string hashed
    by `lex' is found in all-upper case frequently in a large sample of text.
    Users are free to load different data, by default we use a sample from
    Wikipedia, with a threshold of 0.95, picked to maximize mutual information
    for POS tagging.

    >>> is_oft_upper(lookup(u'abc'))
    True
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    # OFT_UPPER is a bit *position* (the first DistFlag member, i.e. 0), not a
    # mask: `flags & OFT_UPPER` is always 0.  Shift it into a mask, the same
    # way check_dist_flag tests a flag.
    return (<Lexeme*>lex_id).dist.flags & (1 << OFT_UPPER)
|
||||
|
||||
|
||||
cpdef bint is_oft_title(size_t lex_id):
    '''Check the OFT_TITLE distribution flag of the Lexeme pointed to by
    lex_id.  The flag stores whether the lowered version of the string hashed
    by `lex' is found title-cased frequently in a large sample of text.
    Users are free to load different data, by default we use a sample from
    Wikipedia, with a threshold of 0.3, picked to maximize mutual information
    for POS tagging.

    >>> is_oft_title(lookup(u'marcus'))
    True
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    # OFT_TITLE is a bit *position* (1), not a mask: `flags & OFT_TITLE` would
    # test bit 0, which is the OFT_UPPER bit.  Shift it into a mask, the same
    # way check_dist_flag tests a flag.
    return (<Lexeme*>lex_id).dist.flags & (1 << OFT_TITLE)
|
||||
|
|
|
@ -9,13 +9,35 @@ ctypedef char Bits8
|
|||
ctypedef uint64_t Bits64
|
||||
|
||||
|
||||
cdef enum OrthFlag:
    # Each member is a bit position; check_orth_flag tests
    # `orth.flags & (1 << flag)`.  The values are the ones the compiler
    # assigns implicitly, spelled out so the bit layout is visible.
    IS_ALPHA = 0
    IS_DIGIT = 1
    IS_PUNCT = 2
    IS_WHITE = 3
    IS_LOWER = 4
    IS_UPPER = 5
    IS_TITLE = 6
    IS_ASCII = 7
|
||||
|
||||
|
||||
cdef enum DistFlag:
    # Each member is a bit position; check_dist_flag tests
    # `dist.flags & (1 << flag)`.  The values are the ones the compiler
    # assigns implicitly, spelled out so the bit layout is visible.
    OFT_UPPER = 0
    OFT_TITLE = 1
    DIST_FLAG3 = 2
    DIST_FLAG4 = 3
    DIST_FLAG5 = 4
    DIST_FLAG6 = 5
    DIST_FLAG7 = 6
    DIST_FLAG8 = 7
|
||||
|
||||
|
||||
cdef struct Orthography:
    # Hashed string views of the word; the strings themselves are kept in the
    # Language's reverse index (bacov), keyed by these hashes.
    StringHash last3
    StringHash shape
    StringHash norm

    size_t length
    # Declared once, as Py_UNICODE, to agree with first_of()'s return type.
    # (Declaring `first` twice with different types is a compile error.)
    Py_UNICODE first
    # Bit field tested by check_orth_flag; bit positions are OrthFlag members.
    Bits8 flags
|
||||
|
||||
|
||||
|
@ -27,15 +49,12 @@ cdef struct Distribution:
|
|||
|
||||
|
||||
cdef struct Lexeme:
    # Each field is declared exactly once.  The layout is the three-field one
    # that BLANK_WORD's positional initialiser below relies on.
    StringHash lex # Hash of the word
    Orthography* orth # Extra orthographic views
    Distribution* dist # Distribution info


# Positional initialiser: (lex, orth, dist), matching the field order above.
cdef Lexeme BLANK_WORD = Lexeme(0, NULL, NULL)
|
||||
|
||||
|
||||
cdef enum StringAttr:
|
||||
|
@ -49,7 +68,16 @@ cdef enum StringAttr:
|
|||
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0

# Hashed string views of a lexeme.
cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0
cpdef StringHash shape_of(size_t lex_id) except 0
cpdef StringHash last3_of(size_t lex_id) except 0

# length_of is declared once: a length is a size_t, not a StringHash, and a
# second declaration with a different signature does not compile.
cpdef size_t length_of(size_t lex_id) except *
cpdef Py_UNICODE first_of(size_t lex_id) except *

cpdef double prob_of(size_t lex_id) except 0
cpdef ClusterID cluster_of(size_t lex_id) except 0

# Bit-flag accessors; `flag` is a bit position from the matching enum.
cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *
cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *
|
||||
|
|
|
@ -13,13 +13,6 @@ from libcpp.vector cimport vector
|
|||
|
||||
from spacy.spacy cimport StringHash
|
||||
|
||||
# Reiterate the enum, for python
|
||||
#SIC = StringAttr.sic
|
||||
#LEX = StringAttr.lex
|
||||
#NORM = StringAttr.norm
|
||||
#SHAPE = StringAttr.shape
|
||||
#LAST3 = StringAttr.last3
|
||||
|
||||
|
||||
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
|
||||
if attr == LEX:
|
||||
|
@ -133,37 +126,9 @@ cpdef double prob_of(size_t lex_id):
|
|||
return (<Lexeme*>lex_id).dist.prob
|
||||
|
||||
|
||||
cpdef bint is_oft_upper(size_t lex_id):
    '''Check the OFT_UPPER distribution flag of the Lexeme pointed to by
    lex_id.  The flag stores whether the lowered version of the string hashed
    by `lex' is found in all-upper case frequently in a large sample of text.
    Users are free to load different data, by default we use a sample from
    Wikipedia, with a threshold of 0.95, picked to maximize mutual information
    for POS tagging.

    Returns False when the lexeme has no distribution info attached.

    >>> is_oft_upper(lookup(u'abc'))
    True
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    cdef Lexeme* w = <Lexeme*>lex_id
    # Previously a stub that always returned False, contradicting the
    # documented contract.  Guard the pointer before dereferencing it.
    if w.dist == NULL:
        return False
    # OFT_UPPER is a bit position, so shift it into a mask.
    return w.dist.flags & (1 << OFT_UPPER)
|
||||
cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *:
    '''Test one orthographic flag of the Lexeme whose address is `lex`.

    `flag` is a bit position (an OrthFlag member); the result is True when
    that bit is set in the lexeme's orth.flags field.
    '''
    # The parameter is named `lex`; the body previously referred to an
    # undefined `lex_id`.
    return (<Lexeme*>lex).orth.flags & (1 << flag)
|
||||
|
||||
|
||||
#return (<Lexeme*>lex_id).oft_upper
|
||||
|
||||
|
||||
cpdef bint is_oft_title(size_t lex_id):
    '''Check the OFT_TITLE distribution flag of the Lexeme pointed to by
    lex_id.  The flag stores whether the lowered version of the string hashed
    by `lex' is found title-cased frequently in a large sample of text.
    Users are free to load different data, by default we use a sample from
    Wikipedia, with a threshold of 0.3, picked to maximize mutual information
    for POS tagging.

    Returns False when the lexeme has no distribution info attached.

    >>> is_oft_title(lookup(u'marcus'))
    True
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    cdef Lexeme* w = <Lexeme*>lex_id
    # Previously a stub that always returned False, contradicting the
    # documented contract.  Guard the pointer before dereferencing it.
    if w.dist == NULL:
        return False
    # OFT_TITLE is a bit position, so shift it into a mask.
    return w.dist.flags & (1 << OFT_TITLE)
|
||||
cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *:
    '''Test one distributional flag of the Lexeme whose address is `lex`.

    `flag` is a bit position (a DistFlag member); the result is True when
    that bit is set in the lexeme's dist.flags field.
    '''
    # The parameter is named `lex`; the body previously referred to an
    # undefined `lex_id`.
    return (<Lexeme*>lex).dist.flags & (1 << flag)
|
||||
|
|
|
@ -60,6 +60,7 @@ cdef class Language:
|
|||
self.chunks.set_empty_key(0)
|
||||
self.vocab.set_empty_key(0)
|
||||
self.load_tokenization(util.read_tokenization(name))
|
||||
self.load_dist_info(util.read_dist_info(name))
|
||||
|
||||
cdef Tokens tokenize(self, unicode string):
|
||||
cdef Lexeme** chunk
|
||||
|
@ -108,7 +109,8 @@ cdef class Language:
|
|||
word.lex = hash(string)
|
||||
self.bacov[word.lex] = string
|
||||
word.orth = self.new_orth(string)
|
||||
word.dist = self.new_dist(string)
|
||||
|
||||
word.dist = <Distribution*>calloc(1, sizeof(Distribution))
|
||||
self.vocab[word.lex] = <size_t>word
|
||||
return word
|
||||
|
||||
|
@ -135,13 +137,8 @@ cdef class Language:
|
|||
self.bacov[orth.last3] = last3
|
||||
self.bacov[orth.norm] = norm
|
||||
self.bacov[orth.shape] = shape
|
||||
|
||||
return orth
|
||||
|
||||
cdef Distribution* new_dist(self, unicode lex) except NULL:
    '''Allocate a zero-initialised Distribution struct and return it.

    `lex` is accepted but not read here.  Raises MemoryError when the
    allocation fails.
    '''
    cdef Distribution* dist = <Distribution*>calloc(1, sizeof(Distribution))
    # With `except NULL`, a NULL return tells the caller that an exception is
    # pending, so a failed calloc must set one explicitly.
    if dist == NULL:
        raise MemoryError()
    return dist
|
||||
|
||||
cdef unicode unhash(self, StringHash hash_value):
    '''Look hash_value up in the reverse index (bacov) and return the
    string stored under it.'''
    cdef unicode original = self.bacov[hash_value]
    return original
|
||||
|
@ -164,21 +161,18 @@ cdef class Language:
|
|||
for chunk, tokens in token_rules:
|
||||
self.new_chunk(chunk, tokens)
|
||||
|
||||
def load_clusters(self):
    '''Read the English Brown-cluster file and create a lexeme for every
    token string listed in it.'''
    data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
    case_stats = util.load_case_stats(data_dir)
    brown_loc = path.join(data_dir, 'clusters')
    cdef size_t start
    cdef int end
    with util.utf8open(brown_loc) as browns_file:
        for i, line in enumerate(browns_file):
            cluster_str, token_string, freq_str = line.split()
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See redshift._parse_features.pyx
            cluster = int(cluster_str[::-1], 2)
            upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
            self.new_lexeme(token_string)

def load_dist_info(self, dist_info):
    '''Copy distributional statistics from dist_info onto the vocabulary.

    dist_info maps each word string to a dict.  The keys read here are
    'prob', 'cluster', 'flags' and 'tagdict'.
    '''
    cdef unicode string
    cdef dict word_dist
    cdef Lexeme* w
    for string, word_dist in dist_info.items():
        # NOTE(review): assumes self.lookup returns a Lexeme*; if it returns
        # a Lexeme_addr, an explicit <Lexeme*> cast is needed here -- confirm.
        w = self.lookup(string)
        # word_dist is a dict, so its entries are read by key, not attribute.
        # The Lexeme struct only has lex, orth and dist, so the statistics
        # are written through w.dist.
        # NOTE(review): `cluster` and `tagdict` are presumed to be fields of
        # Distribution, which is declared outside this view -- confirm.
        w.dist.prob = word_dist['prob']
        w.dist.cluster = word_dist['cluster']
        for flag in word_dist['flags']:
            w.dist.flags |= lexeme.DIST_FLAGS[flag]
        for tag in word_dist['tagdict']:
            w.dist.tagdict |= lexeme.TAGS[tag]
|
||||
|
||||
|
||||
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
|
||||
|
|
|
@ -19,6 +19,12 @@ def load_case_stats(data_dir):
|
|||
return case_stats
|
||||
|
||||
|
||||
def load_dist_info(lang):
    '''Load the distribution info for a language from its JSON data file.

    Reads DATA_DIR/<lang>/distribution_info.json and returns the decoded
    JSON object.  Raises IOError/OSError if the file cannot be opened and
    ValueError if it does not contain valid JSON.
    '''
    loc = path.join(DATA_DIR, lang, 'distribution_info.json')
    # path.join only builds the file name; the file has to be opened before
    # it can be used as a context manager and handed to json.load.
    with open(loc) as file_:
        dist_info = json.load(file_)
    return dist_info


# Language.__init__ calls util.read_dist_info(name), following the naming of
# read_tokenization.  Expose the loader under that name as well.
read_dist_info = load_dist_info
|
||||
|
||||
|
||||
def read_tokenization(lang):
|
||||
loc = path.join(DATA_DIR, lang, 'tokenization')
|
||||
entries = []
|
||||
|
|
Loading…
Reference in New Issue
Block a user