mirror of https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00

* Reforming data model for lexemes

parent e091b6a241
commit 3379d7a571

spacy/en.pyx | 30
@@ -64,3 +64,33 @@ cpdef Lexeme_addr lookup(unicode string) except 0:
 
 cpdef unicode unhash(StringHash hash_value):
     return EN.unhash(hash_value)
+
+
+cpdef bint is_oft_upper(size_t lex_id):
+    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    in all-upper case frequently in a large sample of text. Users are free
+    to load different data, by default we use a sample from Wikipedia, with
+    a threshold of 0.95, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_upper(lookup(u'abc'))
+    True
+    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
+
+
+cpdef bint is_oft_title(size_t lex_id):
+    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    title-cased frequently in a large sample of text. Users are free
+    to load different data, by default we use a sample from Wikipedia, with
+    a threshold of 0.3, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_title(lookup(u'marcus'))
+    True
+    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
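The doctests above state an invariant rather than real output: the oft_upper/oft_title bits are properties of the case-normalised lexeme, so lookup(u'abc') and lookup(u'aBc') must resolve to the same entry and give the same answer. A minimal pure-Python sketch of that contract follows; VOCAB, add_entry and the example flag values are illustrative assumptions, not spaCy's API.

    OFT_UPPER = 1 << 0          # bit masks standing in for DistFlag members
    OFT_TITLE = 1 << 1

    VOCAB = {}                  # lowered string -> packed distribution flags

    def add_entry(string, flags):
        VOCAB[string.lower()] = flags

    def lookup(string):
        # every casing of a word resolves to the same entry
        return VOCAB.get(string.lower(), 0)

    def is_oft_upper(flags):
        return bool(flags & OFT_UPPER)

    add_entry(u'abc', OFT_UPPER)              # e.g. seen upper-cased in >95% of samples
    assert is_oft_upper(lookup(u'abc')) is True
    assert is_oft_upper(lookup(u'aBc')) is True   # same answer, as the doctest requires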
@@ -9,13 +9,35 @@ ctypedef char Bits8
 ctypedef uint64_t Bits64
 
 
+cdef enum OrthFlag:
+    IS_ALPHA
+    IS_DIGIT
+    IS_PUNCT
+    IS_WHITE
+    IS_LOWER
+    IS_UPPER
+    IS_TITLE
+    IS_ASCII
+
+
+cdef enum DistFlag:
+    OFT_UPPER
+    OFT_TITLE
+    DIST_FLAG3
+    DIST_FLAG4
+    DIST_FLAG5
+    DIST_FLAG6
+    DIST_FLAG7
+    DIST_FLAG8
+
+
 cdef struct Orthography:
     StringHash last3
     StringHash shape
     StringHash norm
 
     size_t length
-    unsigned char first
+    Py_UNICODE first
     Bits8 flags
 
 
@@ -27,15 +49,12 @@ cdef struct Distribution:
 
 
 cdef struct Lexeme:
-    StringHash sic              # Hash of the original string
-    StringHash lex              # Hash of the word, with punctuation and clitics split off
-
-    Distribution* dist          # Distribution info, lazy loaded
+    StringHash lex              # Hash of the word
     Orthography* orth           # Extra orthographic views
-    #Lexeme* tail               # Lexemes are linked lists, to deal with sub-tokens
+    Distribution* dist          # Distribution info
 
 
-cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
+cdef Lexeme BLANK_WORD = Lexeme(0, NULL, NULL)
 
 
 cdef enum StringAttr:
@@ -49,7 +68,16 @@ cdef enum StringAttr:
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
 
 cpdef StringHash lex_of(size_t lex_id) except 0
 cpdef StringHash norm_of(size_t lex_id) except 0
 cpdef StringHash shape_of(size_t lex_id) except 0
 cpdef StringHash last3_of(size_t lex_id) except 0
-cpdef StringHash length_of(size_t lex_id)
+
+cpdef size_t length_of(size_t lex_id) except *
+cpdef Py_UNICODE first_of(size_t lex_id) except *
+
+cpdef double prob_of(size_t lex_id) except 0
+cpdef ClusterID cluster_of(size_t lex_id) except 0
+
+cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *
+cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *
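These declarations imply a bit-packing scheme: each OrthFlag/DistFlag member names a bit position, the Orthography struct keeps the orthographic bits in its 8-bit flags field (Bits8), and a flag is queried by shifting, as the check_orth_flag/check_dist_flag signatures suggest and as the implementation further down does with `flags & (1 << flag)`. A rough Python sketch of that scheme, assuming the enum members count up from zero; the helper functions are illustrations, not the Cython code.

    from enum import IntEnum

    class OrthFlag(IntEnum):
        IS_ALPHA = 0
        IS_DIGIT = 1
        IS_PUNCT = 2
        IS_WHITE = 3
        IS_LOWER = 4
        IS_UPPER = 5
        IS_TITLE = 6
        IS_ASCII = 7

    def set_orth_flag(flags, flag):
        return flags | (1 << flag)

    def check_orth_flag(flags, flag):
        # mirrors the `flags & (1 << flag)` test in the .pyx hunk below
        return bool(flags & (1 << flag))

    bits = 0
    bits = set_orth_flag(bits, OrthFlag.IS_ALPHA)
    bits = set_orth_flag(bits, OrthFlag.IS_LOWER)
    assert check_orth_flag(bits, OrthFlag.IS_LOWER)
    assert not check_orth_flag(bits, OrthFlag.IS_UPPER)

All eight OrthFlag members fit exactly into the 8-bit Bits8 flags field, while DistFlag lists six placeholder members (DIST_FLAG3 to DIST_FLAG8), apparently leaving room for more distributional properties.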
@@ -13,13 +13,6 @@ from libcpp.vector cimport vector
 
 from spacy.spacy cimport StringHash
 
-# Reiterate the enum, for python
-#SIC = StringAttr.sic
-#LEX = StringAttr.lex
-#NORM = StringAttr.norm
-#SHAPE = StringAttr.shape
-#LAST3 = StringAttr.last3
-
 
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
     if attr == LEX:
@@ -133,37 +126,9 @@ cpdef double prob_of(size_t lex_id):
     return (<Lexeme*>lex_id).dist.prob
 
 
-cpdef bint is_oft_upper(size_t lex_id):
-    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    in all-upper case frequently in a large sample of text. Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.95, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_upper(lookup(u'abc'))
-    True
-    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
-    True
-    '''
-    return False
-    #cdef Lexeme* w = <Lexeme*>lex_id
-    #return w.orth.last3 if w.orth != NULL else 0
-
-    #return (<Lexeme*>lex_id).oft_upper
+cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *:
+    return (<Lexeme*>lex_id).orth.flags & (1 << flag)
 
 
-cpdef bint is_oft_title(size_t lex_id):
-    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    title-cased frequently in a large sample of text. Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.3, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_title(lookup(u'marcus'))
-    True
-    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
-    True
-    '''
-    return False
-    #return (<Lexeme*>lex_id).oft_title
+cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *:
+    return (<Lexeme*>lex_id).dist.flags & (1 << flag)
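The per-field predicates, which had been stubbed out to return False, are replaced here by two generic checkers, so any new enum member becomes queryable without another cpdef wrapper. A hedged Python sketch of how specific predicates can be rebuilt on top of the generic checker; the partial wrappers are illustrative, not part of the diff.

    from functools import partial

    OFT_UPPER, OFT_TITLE = 0, 1          # enum members used as bit positions

    def check_dist_flag(flags, flag):
        return bool(flags & (1 << flag))

    # specific predicates recovered from the generic checker
    is_oft_upper = partial(check_dist_flag, flag=OFT_UPPER)
    is_oft_title = partial(check_dist_flag, flag=OFT_TITLE)

    assert is_oft_upper(0b01) and not is_oft_title(0b01)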
@@ -60,6 +60,7 @@ cdef class Language:
         self.chunks.set_empty_key(0)
         self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
+        self.load_dist_info(util.read_dist_info(name))
 
     cdef Tokens tokenize(self, unicode string):
         cdef Lexeme** chunk
@@ -108,7 +109,8 @@ cdef class Language:
         word.lex = hash(string)
         self.bacov[word.lex] = string
         word.orth = self.new_orth(string)
-        word.dist = self.new_dist(string)
+
+        word.dist = <Distribution*>calloc(1, sizeof(Distribution))
         self.vocab[word.lex] = <size_t>word
         return word
 
@@ -135,13 +137,8 @@ cdef class Language:
         self.bacov[orth.last3] = last3
         self.bacov[orth.norm] = norm
         self.bacov[orth.shape] = shape
 
         return orth
 
-    cdef Distribution* new_dist(self, unicode lex) except NULL:
-        dist = <Distribution*>calloc(1, sizeof(Distribution))
-        return dist
-
-
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
@@ -164,21 +161,18 @@ cdef class Language:
         for chunk, tokens in token_rules:
             self.new_chunk(chunk, tokens)
 
-    def load_clusters(self):
+    def load_dist_info(self, dist_info):
+        cdef unicode string
+        cdef dict word_dist
         cdef Lexeme* w
-        data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
-        case_stats = util.load_case_stats(data_dir)
-        brown_loc = path.join(data_dir, 'clusters')
-        cdef size_t start
-        cdef int end
-        with util.utf8open(brown_loc) as browns_file:
-            for i, line in enumerate(browns_file):
-                cluster_str, token_string, freq_str = line.split()
-                # Decode as a little-endian string, so that we can do & 15 to get
-                # the first 4 bits. See redshift._parse_features.pyx
-                cluster = int(cluster_str[::-1], 2)
-                upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-                self.new_lexeme(token_string)
+        for string, word_dist in dist_info.items():
+            w = self.lookup(string)
+            w.prob = word_dist.prob
+            w.cluster = word_dist.cluster
+            for flag in word_dist.flags:
+                w.flags |= lexeme.DIST_FLAGS[flag]
+            for tag in word_dist.tagdict:
+                w.tagdict |= lexeme.TAGS[tag]
 
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
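The deleted load_clusters body explains its one non-obvious step in the removed comment: Brown-cluster bit strings were parsed in reverse ("little-endian"), so that a cheap mask like `cluster & 15` recovers the first four bits of the cluster path for prefix features. A small illustration of that decoding trick follows (the path string is a made-up example); in the new design this per-word information arrives pre-computed through dist_info instead.

    def decode_cluster(cluster_str):
        # reverse the bit string, then parse it as base 2
        return int(cluster_str[::-1], 2)

    cluster = decode_cluster("10110100")     # hypothetical Brown cluster path
    first_four_bits = cluster & 15           # == int("1011"[::-1], 2)
    assert first_four_bits == 0b1101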
@@ -19,6 +19,12 @@ def load_case_stats(data_dir):
     return case_stats
 
 
+def load_dist_info(lang):
+    with path.join(DATA_DIR, lang, 'distribution_info.json') as file_:
+        dist_info = json.load(file_)
+    return dist_info
+
+
 def read_tokenization(lang):
     loc = path.join(DATA_DIR, lang, 'tokenization')
     entries = []
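For reference, a hypothetical shape for data/en/distribution_info.json, inferred from the fields that Language.load_dist_info reads (prob, cluster, flags, tagdict); the example word, keys and all values are assumptions for illustration, not the real data file.

    import json

    example = {
        u'marcus': {
            "prob": -9.5,              # made-up log probability
            "cluster": 421,            # made-up Brown cluster id
            "flags": ["OFT_TITLE"],    # names to be resolved via lexeme.DIST_FLAGS
            "tagdict": ["NNP"]         # tags to be resolved via lexeme.TAGS
        }
    }

    print(json.dumps(example, indent=2))

The dotted access in the loading loop (word_dist.prob) suggests these records are wrapped in an object rather than used as raw dicts, so the exact on-disk layout is genuinely uncertain.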