introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately
This commit is contained in:
parent bc9c62e279
commit 03fb498dbe
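
For orientation, a minimal sketch of how the two changes surface at the Python API level. This is illustrative only, assuming spaCy of this era with the English model data installed (python -m spacy.en.download all, the command referenced in the diff below); the sample sentence and printed values are not part of the commit:

    from spacy.en import English

    nlp = English()
    doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')

    # LexemeC now carries a language id: exposed as the integer Token.lang /
    # Lexeme.lang and the string Token.lang_ / Lexeme.lang_ (e.g. u'en').
    print(doc[0].lang_)

    # Doc.noun_chunks now dispatches to a per-language iterator
    # (spacy.en.iterators or spacy.de.iterators) chosen by the sentence root's lang_.
    for np in doc.noun_chunks:
        print(' '.join(t.orth_ for t in np))
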
@@ -109,7 +109,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
     else:
         file_ = loc.open()
     for i, line in enumerate(file_):
-        freq, doc_freq, key = line.split('\t', 2)
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
         freq = int(freq)
         counts.inc(i+1, freq)
         total += freq
@@ -121,7 +121,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
         file_ = loc.open()
     probs = {}
     for line in file_:
-        freq, doc_freq, key = line.split('\t', 2)
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
         doc_freq = int(doc_freq)
         freq = int(freq)
         if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
setup.py (5 changed lines)
@@ -56,14 +56,15 @@ MOD_NAMES = [
     'spacy.tokens.doc',
     'spacy.tokens.span',
     'spacy.tokens.token',
-    'spacy.tokens.npchunks',
     'spacy.serialize.packer',
     'spacy.serialize.huffman',
     'spacy.serialize.bits',
     'spacy.cfile',
     'spacy.matcher',
     'spacy.syntax.ner',
-    'spacy.symbols']
+    'spacy.symbols',
+    'spacy.en.iterators',
+    'spacy.de.iterators']


 # By subclassing build_extensions we have the actual compiler that will be used
@@ -14,12 +14,12 @@ cpdef enum attr_id_t:
     LIKE_EMAIL
     IS_STOP
     IS_OOV
-
-    FLAG14 = 14
-    FLAG15
-    FLAG16
-    FLAG17
-    FLAG18
+    IS_BRACKET
+    IS_QUOTE
+    IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT
+
+    FLAG18 = 18
     FLAG19
     FLAG20
     FLAG21
@@ -85,11 +85,7 @@ cpdef enum attr_id_t:
     HEAD
     SPACY
     PROB
+    LANG
 
-    # Move these up to FLAG14--FLAG18 once we finish the functionality and
-    # are ready to regenerate the model
-    #IS_BRACKET
-    #IS_QUOTE
-    #IS_LEFT_PUNCT
-    #IS_RIGHT_PUNCT
@@ -13,10 +13,10 @@ IDS = {
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
     "IS_OOV": IS_OOV,
-    "FLAG14": FLAG14,
-    "FLAG15": FLAG15,
-    "FLAG16": FLAG16,
-    "FLAG17": FLAG17,
+    "IS_BRACKET": IS_BRACKET,
+    "IS_QUOTE": IS_QUOTE,
+    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
+    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
     "FLAG18": FLAG18,
     "FLAG19": FLAG19,
     "FLAG20": FLAG20,
@@ -83,6 +83,7 @@ IDS = {
     "HEAD": HEAD,
     "SPACY": SPACY,
     "PROB": PROB,
+    "LANG": LANG,
 }

 # ATTR IDs, in order of the symbol
@@ -1,31 +1,9 @@
-from ..structs cimport TokenC
-from .doc cimport Doc
-from .span cimport Span
-
-from ..parts_of_speech cimport NOUN, PROPN, PRON
-
-def english(Span sent):
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
-    np_deps = [strings[label] for label in labels]
-    conj = strings['conj']
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-        elif word.pos == NOUN and word.dep == conj:
-            head = word+word.head
-            while head.dep == conj and head.head < 0:
-                head += head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-
-
-def german(Span sent):
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+
+from spacy.parts_of_speech cimport NOUN
+
+def noun_chunks(Span sent):
     # this function extracts spans headed by NOUNs starting from the left-most
     # syntactic dependent until the NOUN itself
     # for close apposition and measurement construction, the span is sometimes
@@ -48,7 +26,3 @@ def german(Span sent):
                 if rdep.pos == NOUN and rdep.dep == close_app:
                     rbracket = rdep.i+1
             yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
-
-
-
-
spacy/en/iterators.pxd (new file, 0 lines)
spacy/en/iterators.pyx (new file, 24 lines)
@@ -0,0 +1,24 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+
+from spacy.parts_of_speech cimport NOUN
+
+def noun_chunks(Span sent):
+    cdef const TokenC* word
+    strings = sent.doc.vocab.strings
+    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+    np_deps = [strings[label] for label in labels]
+    conj = strings['conj']
+    np_label = strings['NP']
+    for i in range(sent.start, sent.end):
+        word = &sent.doc.c[i]
+        if word.pos == NOUN and word.dep in np_deps:
+            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
+        elif word.pos == NOUN and word.dep == conj:
+            head = word+word.head
+            while head.dep == conj and head.head < 0:
+                head += head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
+
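As a side note on the new English iterator above: a noun whose dependency label is in np_deps starts a chunk at its left edge, while a noun attached by 'conj' climbs the conjunction chain to its head and inherits NP status from it. A minimal plain-Python analogue of that head-climbing check, with a made-up token structure purely for illustration (not part of the commit):

    # Each "token" is (dep, head_offset); head_offset is relative, 0 for the root.
    def is_coordinated_np(tokens, i, np_deps, conj='conj'):
        dep, head_offset = tokens[i]
        if dep != conj:
            return dep in np_deps
        # walk the conj chain leftwards, as the Cython loop does with pointer arithmetic
        j = i
        while tokens[j][0] == conj and tokens[j][1] < 0:
            j += tokens[j][1]
        return tokens[j][0] in np_deps
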
@@ -69,6 +69,7 @@ class Language(object):
             attrs.SUFFIX: cls.suffix,
             attrs.CLUSTER: cls.cluster,
             attrs.PROB: lambda string: oov_prob,
+            attrs.LANG: lambda string: cls.lang,
             attrs.IS_ALPHA: orth.is_alpha,
             attrs.IS_ASCII: orth.is_ascii,
             attrs.IS_DIGIT: cls.is_digit,
@@ -77,10 +78,10 @@ class Language(object):
             attrs.IS_SPACE: cls.is_space,
             attrs.IS_TITLE: orth.is_title,
             attrs.IS_UPPER: orth.is_upper,
-            attrs.FLAG14: orth.is_bracket,
-            attrs.FLAG15: orth.is_quote,
-            attrs.FLAG16: orth.is_left_punct,
-            attrs.FLAG17: orth.is_right_punct,
+            attrs.IS_BRACKET: orth.is_bracket,
+            attrs.IS_QUOTE: orth.is_quote,
+            attrs.IS_LEFT_PUNCT: orth.is_left_punct,
+            attrs.IS_RIGHT_PUNCT: orth.is_right_punct,
             attrs.LIKE_URL: orth.like_url,
             attrs.LIKE_NUM: orth.like_number,
             attrs.LIKE_EMAIL: orth.like_email,
@@ -1,6 +1,6 @@
 from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
 from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG
 
 from .structs cimport LexemeC
 from .strings cimport StringStore
@@ -41,6 +41,8 @@ cdef class Lexeme:
             lex.suffix = value
         elif name == CLUSTER:
             lex.cluster = value
+        elif name == LANG:
+            lex.lang = value
 
     @staticmethod
     cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@@ -67,6 +69,8 @@ cdef class Lexeme:
             return lex.length
         elif feat_name == CLUSTER:
             return lex.cluster
+        elif feat_name == LANG:
+            return lex.lang
         else:
             return 0
 
@@ -18,10 +18,10 @@ import numpy
 
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from .attrs cimport FLAG14 as IS_BRACKET
-from .attrs cimport FLAG15 as IS_QUOTE
-from .attrs cimport FLAG16 as IS_LEFT_PUNCT
-from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from .attrs cimport IS_BRACKET
+from .attrs cimport IS_QUOTE
+from .attrs cimport IS_LEFT_PUNCT
+from .attrs cimport IS_RIGHT_PUNCT
 from .attrs cimport IS_OOV
 
 
@@ -123,6 +123,10 @@ cdef class Lexeme:
         def __get__(self): return self.c.cluster
         def __set__(self, int x): self.c.cluster = x
 
+    property lang:
+        def __get__(self): return self.c.lang
+        def __set__(self, int x): self.c.lang = x
+
     property prob:
         def __get__(self): return self.c.prob
         def __set__(self, float x): self.c.prob = x
@@ -147,6 +151,10 @@ cdef class Lexeme:
         def __get__(self): return self.vocab.strings[self.c.suffix]
         def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
 
+    property lang_:
+        def __get__(self): return self.vocab.strings[self.c.lang]
+        def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]
+
     property flags:
         def __get__(self): return self.c.flags
         def __set__(self, flags_t x): self.c.flags = x
@@ -40,17 +40,17 @@ cpdef bint is_bracket(unicode string):
 
 
 cpdef bint is_quote(unicode string):
-    quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯')
+    quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯',"''",'``')
     return string in quotes
 
 
 cpdef bint is_left_punct(unicode string):
-    left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮')
+    left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮','``')
     return string in left_punct
 
 
 cpdef bint is_right_punct(unicode string):
-    right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯')
+    right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯',"''")
     return string in right_punct
 
 
@@ -9,6 +9,8 @@ cdef struct LexemeC:
 
     flags_t flags
 
+    attr_t lang
+
     attr_t id
     attr_t length
 
@@ -8,6 +8,7 @@ import struct
 cimport numpy as np
 import math
 import six
+import warnings
 
 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
@@ -23,7 +24,6 @@ from .token cimport Token
 from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 
-import npchunks
 
 DEF PADDING = 5
 
@@ -241,11 +241,23 @@ cdef class Doc:
                     "\npython -m spacy.en.download all\n"
                     "to install the data")
 
-        chunk_rules = {'en':npchunks.english, 'de':npchunks.german}
+        from spacy.en.iterators import noun_chunks as en_noun_chunks
+        from spacy.de.iterators import noun_chunks as de_noun_chunks
+
+        chunk_rules = {'en':en_noun_chunks,
+                       'de':de_noun_chunks,
+                      }
+
         for sent in self.sents:
-            lang = 'en' # todo: make dependent on language of root token
-            for chunk in chunk_rules.get(lang)(sent):
+            print(sent)
+            lang = sent.root.lang_
+            chunker = chunk_rules.get(lang,None)
+            if chunker == None:
+                warnings.warn("noun_chunks is not available for language %s." % lang)
+                print(sent.root.orth_)
+                continue
+
+            for chunk in chunker(sent):
                 yield chunk
 
 
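Shown in isolation, the per-language dispatch that the hunk above adds to Doc.noun_chunks reduces to a dictionary lookup with a warning fallback. A plain-Python sketch of that pattern for clarity; chunks_for and the stub generators are illustrative names only, standing in for the Cython noun_chunks generators in spacy.en.iterators and spacy.de.iterators:

    import warnings

    def en_noun_chunks(sent):
        # stand-in for spacy.en.iterators.noun_chunks
        return iter(())

    def de_noun_chunks(sent):
        # stand-in for spacy.de.iterators.noun_chunks
        return iter(())

    chunk_rules = {'en': en_noun_chunks, 'de': de_noun_chunks}

    def chunks_for(sent, lang):
        chunker = chunk_rules.get(lang)
        if chunker is None:
            warnings.warn("noun_chunks is not available for language %s." % lang)
            return
        for chunk in chunker(sent):
            yield chunk
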
@@ -18,10 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CONJ, PUNCT
 
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport FLAG14 as IS_BRACKET
-from ..attrs cimport FLAG15 as IS_QUOTE
-from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
-from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from ..attrs cimport IS_BRACKET
+from ..attrs cimport IS_QUOTE
+from ..attrs cimport IS_LEFT_PUNCT
+from ..attrs cimport IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV
 
@@ -95,6 +95,10 @@ cdef class Token:
         def __get__(self):
             return self.c.lex.prob
 
+    property lang:
+        def __get__(self):
+            return self.c.lex.lang
+
     property idx:
         def __get__(self):
             return self.c.idx
@@ -310,6 +314,10 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]
 
+    property lang_:
+        def __get__(self):
+            return self.vocab.strings[self.c.lex.lang]
+
     property lemma_:
         def __get__(self):
             return self.vocab.strings[self.c.lemma]
@@ -246,6 +246,7 @@ cdef class Vocab:
             fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1)
             fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1)
             fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
+            fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
         fp.close()
 
     def load_lexemes(self, loc):
@@ -278,6 +279,7 @@ cdef class Vocab:
             fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
             fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
             fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
+            fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))
 
             lexeme.vector = EMPTY_VEC
             py_str = self.strings[lexeme.orth]