mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 01:46:28 +03:00)

commit 6c7e44140b
parent 7e69e17161

* Work on word vectors, and other stuff
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
 from os import path
+import re
 
 from .. import orth
 from ..vocab import Vocab
@@ -11,6 +12,9 @@ from .pos import POS_TAGS
 from .attrs import get_flags
 
 
+from ..util import read_lang_data
+
+
 def get_lex_props(string):
     return {
         'flags': get_flags(string),
@@ -64,10 +68,15 @@ class English(object):
         tag_names = list(POS_TAGS.keys())
         tag_names.sort()
         if data_dir is None:
-            self.tokenizer = Tokenizer(self.vocab, {}, None, None, None,
-                                       POS_TAGS, tag_names)
+            tok_rules = {}
+            prefix_re = None
+            suffix_re = None
+            infix_re = None
         else:
-            self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
+            tok_data_dir = path.join(data_dir, 'tokenizer')
+            tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
+        self.tokenizer = Tokenizer(self.vocab, tok_rules, re.compile(prefix_re),
+                                   re.compile(suffix_re), re.compile(infix_re),
                                    POS_TAGS, tag_names)
         self.strings = self.vocab.strings
         self._tagger = None
@@ -100,11 +109,11 @@ class English(object):
         Returns:
             tokens (spacy.tokens.Tokens):
         """
-        tokens = self.tokenizer.tokenize(text)
+        tokens = self.tokenizer(text)
         if tag:
             self.tagger(tokens)
         if parse:
-            self.parser.parse(tokens)
+            self.parser(tokens)
         return tokens
 
     @property
@@ -1,18 +1,16 @@
-from ..typedefs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
-from ..typedefs cimport FLAG8, FLAG9
-from ..typedefs cimport ID as _ID
-from ..typedefs cimport SIC as _SIC
-from ..typedefs cimport SHAPE as _SHAPE
-from ..typedefs cimport NORM1 as _NORM1
-from ..typedefs cimport NORM2 as _NORM2
-from ..typedefs cimport CLUSTER as _CLUSTER
-from ..typedefs cimport PREFIX as _PREFIX
-from ..typedefs cimport SUFFIX as _SUFFIX
-from ..typedefs cimport LEMMA as _LEMMA
-from ..typedefs cimport POS as _POS
+from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
+from ..attrs cimport FLAG8, FLAG9
+from ..attrs cimport SIC as _SIC
+from ..attrs cimport SHAPE as _SHAPE
+from ..attrs cimport NORM1 as _NORM1
+from ..attrs cimport NORM2 as _NORM2
+from ..attrs cimport CLUSTER as _CLUSTER
+from ..attrs cimport PREFIX as _PREFIX
+from ..attrs cimport SUFFIX as _SUFFIX
+from ..attrs cimport LEMMA as _LEMMA
+from ..attrs cimport POS as _POS
 
 
-# Work around the lack of global cpdef variables
 cpdef enum:
     IS_ALPHA = FLAG0
     IS_ASCII = FLAG1
@@ -25,7 +23,6 @@ cpdef enum:
     LIKE_URL = FLAG8
     LIKE_NUM = FLAG9
 
-    ID = _ID
     SIC = _SIC
     SHAPE = _SHAPE
     NORM1 = _NORM1
@@ -4,33 +4,52 @@ import tarfile
 import shutil
 import requests
 
-PARSER_URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz'
+PARSER_URL = 'http://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz'
 
-DEST_DIR = path.join(path.dirname(__file__), 'data', 'deps')
+DEP_VECTORS_URL = 'http://u.cs.biu.ac.il/~yogo/data/syntemb/deps.words.bz2'
+
+DEST_DIR = path.join(path.dirname(__file__), 'data')
 
 
 def download_file(url):
     local_filename = url.split('/')[-1]
+    return path.join(DEST_DIR, local_filename)
     # NOTE the stream=True parameter
     r = requests.get(url, stream=True)
+    print "Download %s" % url
+    i = 0
     with open(local_filename, 'wb') as f:
         for chunk in r.iter_content(chunk_size=1024):
             if chunk: # filter out keep-alive new chunks
                 f.write(chunk)
                 f.flush()
+            print i
+            i += 1
     return local_filename
 
 
-def main():
-    if not os.path.exists(DEST_DIR):
-        os.mkdir(DEST_DIR)
-    assert not path.exists(path.join(DEST_DIR, 'en'))
+def install_parser_model(url, dest_dir):
+    if not os.path.exists(dest_dir):
+        os.mkdir(dest_dir)
+    assert not path.exists(path.join(dest_dir, 'en'))
 
-    filename = download_file(URL)
+    filename = download_file(url)
     t = tarfile.open(filename, mode=":gz")
-    t.extractall(DEST_DIR)
-    shutil.move(path.join(DEST_DIR, 'en', 'deps', 'model'), DEST_DIR)
-    shutil.move(path.join(DEST_DIR, 'en', 'deps', 'config.json'), DEST_DIR)
-    shutil.rmtree(path.join(DEST_DIR, 'en'))
+    t.extractall(dest_dir)
+    shutil.move(path.join(dest_dir, 'en', 'deps', 'model'), dest_dir)
+    shutil.move(path.join(dest_dir, 'en', 'deps', 'config.json'), dest_dir)
+    shutil.rmtree(path.join(dest_dir, 'en'))
+
+
+def install_dep_vectors(url, dest_dir):
+    if not os.path.exists(dest_dir):
+        os.mkdir(dest_dir)
+
+    filename = download_file(url)
+    shutil.move(filename, path.join(dest_dir, 'vec.bz2'))
+
+
+def main():
+    #install_parser_model(PARSER_URL, path.join(DEST_DIR, 'deps'))
+    install_dep_vectors(DEP_VECTORS_URL, path.join(DEST_DIR, 'vocab'))
 
 
 if __name__ == '__main__':
@@ -247,11 +247,12 @@ cdef class EnPosTagger:
         cdef atom_t[N_CONTEXT_FIELDS] context
         cdef const weight_t* scores
         for i in range(tokens.length):
-            if tokens.data[i].fine_pos == 0:
+            if tokens.data[i].pos == 0:
                 fill_context(context, i, tokens.data)
                 scores = self.model.score(context)
-                tokens.data[i].fine_pos = arg_max(scores, self.model.n_classes)
+                tokens.data[i].tag = arg_max(scores, self.model.n_classes)
                 self.set_morph(i, tokens.data)
+        tokens.pos_scheme = self.tag_map
 
     def train(self, Tokens tokens, object golds):
         cdef int i
@@ -263,13 +264,13 @@
             scores = self.model.score(context)
             guess = arg_max(scores, self.model.n_classes)
             self.model.update(context, guess, golds[i], guess != golds[i])
-            tokens.data[i].fine_pos = guess
+            tokens.data[i].tag = guess
             self.set_morph(i, tokens.data)
             correct += guess == golds[i]
         return correct
 
     cdef int set_morph(self, const int i, TokenC* tokens) except -1:
-        cdef const PosTag* tag = &self.tags[tokens[i].fine_pos]
+        cdef const PosTag* tag = &self.tags[tokens[i].tag]
         tokens[i].pos = tag.pos
         cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.sic)
         if cached is NULL:
@@ -3,39 +3,70 @@ from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLU
 from .structs cimport LexemeC
 from .strings cimport StringStore
 
+from numpy cimport ndarray
 
 
 
 cdef LexemeC EMPTY_LEXEME
 
 
-cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings) except -1
+cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
+                              const float* empty_vec) except -1
 
 cdef class Lexeme:
-    cdef const float* vec
+    cdef readonly ndarray vec
 
     cdef readonly flags_t flags
     cdef readonly attr_t id
    cdef readonly attr_t length
 
     cdef readonly attr_t sic
-    cdef readonly unicode norm1
-    cdef readonly unicode norm2
-    cdef readonly unicode shape
-    cdef readonly unicode prefix
-    cdef readonly unicode suffix
+    cdef readonly attr_t norm1
+    cdef readonly attr_t norm2
+    cdef readonly attr_t shape
+    cdef readonly attr_t prefix
+    cdef readonly attr_t suffix
 
-    cdef readonly attr_t sic_id
-    cdef readonly attr_t norm1_id
-    cdef readonly attr_t norm2_id
-    cdef readonly attr_t shape_id
-    cdef readonly attr_t prefix_id
-    cdef readonly attr_t suffix_id
+    cdef readonly unicode sic_
+    cdef readonly unicode norm1_
+    cdef readonly unicode norm2_
+    cdef readonly unicode shape_
+    cdef readonly unicode prefix_
+    cdef readonly unicode suffix_
 
     cdef readonly attr_t cluster
     cdef readonly float prob
     cdef readonly float sentiment
 
+    # Workaround for an apparent bug in the way the decorator is handled ---
+    # TODO: post bug report / patch to Cython.
+    @staticmethod
+    cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings):
+        cdef Lexeme py = Lexeme.__new__(Lexeme, 300)
+        for i in range(300):
+            py.vec[i] = ptr.vec[i]
+        py.flags = ptr.flags
+        py.id = ptr.id
+        py.length = ptr.length
 
-cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings)
+        py.sic = ptr.sic
+        py.norm1 = ptr.norm1
+        py.norm2 = ptr.norm2
+        py.shape = ptr.shape
+        py.prefix = ptr.prefix
+        py.suffix = ptr.suffix
+
+        py.sic_ = strings[ptr.sic]
+        py.norm1_ = strings[ptr.norm1]
+        py.norm2_ = strings[ptr.norm2]
+        py.shape_ = strings[ptr.shape]
+        py.prefix_ = strings[ptr.prefix]
+        py.suffix_ = strings[ptr.suffix]
+
+        py.cluster = ptr.cluster
+        py.prob = ptr.prob
+        py.sentiment = ptr.sentiment
+        return py
 
 
 cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
@@ -7,13 +7,14 @@ from libc.string cimport memset
 
 from .orth cimport word_shape
 from .typedefs cimport attr_t
+import numpy
 
 
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 
 
-cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store) except -1:
+cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
+                              const float* empty_vec) except -1:
     lex.length = props['length']
     lex.sic = string_store[props['sic']]
     lex.norm1 = string_store[props['norm1']]
@@ -27,39 +28,10 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
     lex.sentiment = props['sentiment']
 
     lex.flags = props['flags']
+    lex.vec = empty_vec
 
 
 cdef class Lexeme:
     """A dummy docstring"""
-    def __init__(self):
-        pass
-
-
-cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings):
-    cdef Lexeme py = Lexeme.__new__(Lexeme)
-
-    py.vec = c.vec
-
-    py.flags = c.flags
-    py.id = c.id
-    py.length = c.length
-
-    py.sic = c.sic
-    py.norm1 = strings[c.norm1]
-    py.norm2 = strings[c.norm2]
-    py.shape = strings[c.shape]
-    py.prefix = strings[c.prefix]
-    py.suffix = strings[c.suffix]
-
-    py.sic_id = c.sic
-    py.norm1_id = c.norm1
-    py.norm2_id = c.norm2
-    py.shape_id = c.shape
-    py.prefix_id = c.prefix
-    py.suffix_id = c.suffix
-
-    py.cluster = c.cluster
-
-    py.prob = c.prob
-    py.sentiment = c.sentiment
-    return py
+    def __cinit__(self, int vec_size):
+        self.vec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
@@ -137,8 +137,42 @@ cpdef unicode word_shape(unicode string):
     return ''.join(shape)
 
 
+cpdef unicode norm1(unicode string, lower_pc=0.0, upper_pc=0.0, title_pc=0.0):
+    """Apply level 1 normalization:
+
+    * Case is canonicalized, using frequency statistics
+    * Unicode mapped to ascii, via unidecode
+    * Regional spelling variations are normalized
+    """
+    pass
+
+
 cpdef bytes asciied(unicode string):
     cdef str stripped = unidecode(string)
     if not stripped:
         return b'???'
     return stripped.encode('ascii')
+
+
+# Exceptions --- do not convert these
+_uk_us_except = set([
+    'our',
+    'ours',
+    'four',
+    'fours',
+    'your',
+    'yours',
+    'hour',
+    'hours',
+    'course',
+    'rise',
+])
+def uk_to_usa(unicode string):
+    if not string.islower():
+        return string
+    if string in _uk_us_except:
+        return string
+    our = re.compile(r'ours?$')
+    string = our.sub('or', string)
+
+    return string
@@ -44,12 +44,12 @@ cdef struct TokenC:
     const LexemeC* lex
     Morphology morph
     univ_tag_t pos
-    int fine_pos
+    int tag
     int idx
     int lemma
     int sense
     int head
-    int dep_tag
+    int dep
     uint32_t l_kids
     uint32_t r_kids
 
@@ -28,7 +28,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
     else:
         context[0] = token.lex.sic
         context[1] = token.lemma
-        context[2] = token.fine_pos
+        context[2] = token.tag
         context[3] = token.lex.cluster
         # We've read in the string little-endian, so now we can take & (2**n)-1
         # to get the first n bits of the cluster.
@@ -44,7 +44,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
         # the source that are set to 1.
         context[4] = token.lex.cluster & 63
         context[5] = token.lex.cluster & 15
-        context[6] = token.dep_tag if has_head(token) else 0
+        context[6] = token.dep if has_head(token) else 0
 
 
 cdef int fill_context(atom_t* context, State* state) except -1:
@@ -12,7 +12,7 @@ DEF NON_MONOTONIC = True
 cdef int add_dep(State *s, int head, int child, int label) except -1:
     cdef int dist = head - child
     s.sent[child].head = dist
-    s.sent[child].dep_tag = label
+    s.sent[child].dep = label
     # Keep a bit-vector tracking child dependencies. If a word has a child at
     # offset i from it, set that bit (tracking left and right separately)
     if child > head:
@@ -38,7 +38,7 @@ cdef int push_stack(State *s) except -1:
     if at_eol(s):
         while s.stack_len != 0:
             if not has_head(get_s0(s)):
-                get_s0(s).dep_tag = 0
+                get_s0(s).dep = 0
             pop_stack(s)
 
 
@@ -123,7 +123,7 @@ cdef class TransitionSystem:
         if t.move == SHIFT:
             # Set the dep label, in case we need it after we reduce
            if NON_MONOTONIC:
-                get_s0(s).dep_tag = t.label
+                get_s0(s).dep = t.label
             push_stack(s)
         elif t.move == LEFT:
             add_dep(s, s.i, s.stack[0], t.label)
@@ -132,7 +132,7 @@ cdef class TransitionSystem:
             add_dep(s, s.stack[0], s.i, t.label)
             push_stack(s)
         elif t.move == REDUCE:
-            add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep_tag)
+            add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep)
             pop_stack(s)
         else:
             raise Exception(t.move)
@@ -9,5 +9,3 @@ cdef class GreedyParser:
     cdef object cfg
     cdef readonly Model model
     cdef TransitionSystem moves
-
-    cpdef int parse(self, Tokens tokens) except -1
@@ -65,7 +65,7 @@ cdef class GreedyParser:
         hasty_templ, full_templ = get_templates(self.cfg.features)
         self.model = Model(self.moves.n_moves, full_templ, model_dir)
 
-    cpdef int parse(self, Tokens tokens) except -1:
+    def __call__(self, Tokens tokens):
         cdef:
             Transition guess
             uint64_t state_key
@@ -28,8 +28,6 @@ cdef class Tokenizer:
     cdef object _infix_re
 
     cpdef Tokens tokens_from_list(self, list strings)
-    cpdef Tokens tokenize(self, unicode text)
-
 
     cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
@@ -31,18 +31,6 @@ cdef class Tokenizer:
         self.vocab = vocab
         self._load_special_tokenization(rules, pos_tags, tag_names)
 
-    @classmethod
-    def from_dir(cls, Vocab vocab, object data_dir, object pos_tags, object tag_names):
-        if not path.exists(data_dir):
-            raise IOError("Directory %s not found -- cannot load Tokenizer." % data_dir)
-        if not path.isdir(data_dir):
-            raise IOError("Path %s is a file, not a dir -- cannot load Tokenizer." % data_dir)
-
-        assert path.exists(data_dir) and path.isdir(data_dir)
-        rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir)
-        return cls(vocab, rules, re.compile(prefix_re), re.compile(suffix_re),
-                   re.compile(infix_re), pos_tags, tag_names)
-
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
         cdef Tokens tokens = Tokens(self.vocab, length)
@@ -57,7 +45,7 @@
             idx += len(py_string) + 1
         return tokens
 
-    cpdef Tokens tokenize(self, unicode string):
+    def __call__(self, unicode string):
         """Tokenize a string.
 
         The tokenization rules are defined in three places:
@@ -257,7 +245,7 @@
             tokens[i].lemma = self.vocab.strings[lemma]
             if 'pos' in props:
                 # TODO: Clean up this mess...
-                tokens[i].fine_pos = tag_names.index(props['pos'])
+                tokens[i].tag = tag_names.index(props['pos'])
                 tokens[i].pos = tag_map[props['pos']][0]
                 # These are defaults, which can be over-ridden by the
                 # token-specific props.
@@ -198,9 +198,8 @@ cdef class Token:
         self.sentiment = t.lex.sentiment
         self.flags = t.lex.flags
         self.lemma = t.lemma
-        self.pos = t.pos
-        self.fine_pos = t.fine_pos
-        self.dep_tag = t.dep_tag
+        self.tag = t.tag
+        self.dep = t.dep
 
     def __unicode__(self):
         cdef const TokenC* t = &self._seq.data[self.i]
@@ -220,6 +219,12 @@
         """
         return self._seq.data[self.i].lex.length
 
+    def check_flag(self, attr_id_t flag):
+        return False
+
+    def is_pos(self, univ_tag_t pos):
+        return False
+
     property head:
         """The token predicted by the parser to be the head of the current token."""
         def __get__(self):
@@ -267,16 +272,10 @@
             cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
             return py_ustr
 
-    property pos_:
+    property tag_:
         def __get__(self):
-            return self._seq.vocab.strings[self.pos]
+            return self._seq.tag_names[self.tag]
 
-    property fine_pos_:
+    property dep_:
         def __get__(self):
-            return self._seq.vocab.strings[self.fine_pos]
-
-    property dep_tag_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.dep_tag]
-
-
+            return self._seq.dep_names[self.dep]

spacy/vocab.pyx

@@ -1,16 +1,20 @@
 from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from libc.string cimport memset
+from libc.stdint cimport int32_t
 
+import bz2
 from os import path
 import codecs
 
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport set_lex_struct_props
-from .lexeme cimport Lexeme_cinit
+from .lexeme cimport Lexeme
 from .strings cimport slice_unicode
 from .strings cimport hash_string
 from .orth cimport word_shape
 
+from cymem.cymem cimport Address
 
 
 DEF MAX_VEC_SIZE = 100000
 
@@ -34,12 +38,15 @@ cdef class Vocab:
         if data_dir is not None:
            if not path.exists(data_dir):
                 raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
+        assert EMPTY_LEXEME.vec != NULL
         if data_dir is not None:
             if not path.isdir(data_dir):
                 raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
             self.strings.load(path.join(data_dir, 'strings.txt'))
             self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
-            #self.load_vectors(path.join(data_dir, 'deps.words'))
+            self.load_vectors(path.join(data_dir, 'vec.bin'))
+            for i in range(self.lexemes.size()):
+                assert self.lexemes[i].vec != NULL, repr(self.strings[self.lexemes[i].sic])
 
     def __len__(self):
         """The current number of lexemes stored."""
@@ -52,13 +59,15 @@
         cdef LexemeC* lex
         lex = <LexemeC*>self._map.get(c_str.key)
         if lex != NULL:
+            assert lex.vec != NULL
             return lex
         if c_str.n < 3:
             mem = self.mem
         cdef unicode py_str = c_str.chars[:c_str.n]
         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
         props = self.lexeme_props_getter(py_str)
-        set_lex_struct_props(lex, props, self.strings)
+        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
+        assert lex.vec != NULL
         if mem is self.mem:
             lex.id = self.lexemes.size()
             self._add_lex_to_vocab(c_str.key, lex)
@@ -98,7 +107,7 @@
             lexeme = self.get(self.mem, &c_str)
         else:
             raise ValueError("Vocab unable to map type: %s. Maps unicode --> int or int --> unicode" % str(type(id_or_string)))
-        return Lexeme_cinit(lexeme, self.strings)
+        return Lexeme.from_ptr(lexeme, self.strings)
 
     def __setitem__(self, unicode py_str, dict props):
         cdef UniStr c_str
@@ -109,7 +118,8 @@
         lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
         lex.id = self.lexemes.size()
         self._add_lex_to_vocab(c_str.key, lex)
-        set_lex_struct_props(lex, props, self.strings)
+        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
+        assert lex.vec != NULL
         assert lex.sic < 1000000
 
     def dump(self, loc):
@@ -147,8 +157,9 @@
             if st != 1:
                 break
             lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
-            lexeme.vec = EMPTY_VEC
+            # Copies data from the file into the lexeme
             st = fread(lexeme, sizeof(LexemeC), 1, fp)
+            lexeme.vec = EMPTY_VEC
             if st != 1:
                 break
             self._map.set(key, lexeme)
@@ -159,27 +170,96 @@
         fclose(fp)
 
     def load_vectors(self, loc):
-        cdef int i
-        cdef unicode line
-        cdef unicode word
-        cdef unicode val_str
-        cdef hash_t key
-        cdef LexemeC* lex
+        file_ = _CFile(loc, 'rb')
+        cdef int32_t word_len
+        cdef int32_t vec_len
         cdef float* vec
+        cdef Address mem
+        cdef id_t string_id
+        cdef bytes py_word
+        cdef vector[float*] vectors
+        cdef int i
+        while True:
+            try:
+                file_.read(&word_len, sizeof(word_len), 1)
+            except IOError:
+                break
+            file_.read(&vec_len, sizeof(vec_len), 1)
 
-        with codecs.open(loc, 'r', 'utf8') as file_:
-            for line in file_:
-                pieces = line.split()
-                word = pieces.pop(0)
-                if len(pieces) >= MAX_VEC_SIZE:
-                    sizes = (len(pieces), MAX_VEC_SIZE)
-                    msg = ("Your vector is %d elements."
-                           "The compile-time limit is %d elements." % sizes)
-                    raise ValueError(msg)
-                key = hash_string(word)
-                lex = <LexemeC*>self._map.get(key)
-                if lex is not NULL:
-                    vec = <float*>self.mem.alloc(len(pieces), sizeof(float))
-                    for i, val_str in enumerate(pieces):
-                        vec[i] = float(val_str)
-                    lex.vec = vec
+            mem = Address(word_len, sizeof(char))
+            chars = <char*>mem.ptr
+            vec = <float*>self.mem.alloc(vec_len, sizeof(float))
+
+            file_.read(chars, sizeof(char), word_len)
+            file_.read(vec, sizeof(float), vec_len)
+
+            string_id = self.strings[chars[:word_len]]
+            while string_id >= vectors.size():
+                vectors.push_back(EMPTY_VEC)
+            assert vec != NULL
+            vectors[string_id] = vec
+        cdef LexemeC* lex
+        for i in range(self.lexemes.size()):
+            # Cast away the const, cos we can modify our lexemes
+            lex = <LexemeC*>self.lexemes[i]
+            if lex.sic < vectors.size():
+                lex.vec = vectors[lex.sic]
+            else:
+                lex.vec = EMPTY_VEC
+            assert lex.vec != NULL
+
+
+def write_binary_vectors(in_loc, out_loc):
+    cdef _CFile out_file = _CFile(out_loc, 'wb')
+    cdef Address mem
+    cdef int32_t word_len
+    cdef int32_t vec_len
+    cdef char* chars
+    with bz2.BZ2File(in_loc, 'r') as file_:
+        for line in file_:
+            pieces = line.split()
+            word = pieces.pop(0)
+            mem = Address(len(pieces), sizeof(float))
+            vec = <float*>mem.ptr
+            for i, val_str in enumerate(pieces):
+                vec[i] = float(val_str)
+
+            word_len = len(word)
+            vec_len = len(pieces)
+
+            out_file.write(sizeof(word_len), 1, &word_len)
+            out_file.write(sizeof(vec_len), 1, &vec_len)
+
+            chars = <char*>word
+            out_file.write(sizeof(char), len(word), chars)
+            out_file.write(sizeof(float), vec_len, vec)
+
+
+cdef class _CFile:
+    cdef FILE* fp
+    def __init__(self, loc, mode):
+        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+        self.fp = fopen(<char*>bytes_loc, mode)
+        if self.fp == NULL:
+            raise IOError
+
+    def __dealloc__(self):
+        fclose(self.fp)
+
+    def close(self):
+        fclose(self.fp)
+
+    cdef int read(self, void* dest, size_t elem_size, size_t n) except -1:
+        st = fread(dest, elem_size, n, self.fp)
+        if st != n:
+            raise IOError
+
+    cdef int write(self, size_t elem_size, size_t n, void* data) except -1:
+        st = fwrite(data, elem_size, n, self.fp)
+        if st != n:
+            raise IOError
+
+    cdef int write_unicode(self, unicode value):
+        cdef bytes py_bytes = value.encode('utf8')
+        cdef char* chars = <char*>py_bytes
+        self.write(sizeof(char), len(py_bytes), chars)
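Note (not part of the commit): the write_binary_vectors() / load_vectors() pair above implies a simple per-entry binary layout: an int32 word length, an int32 vector length, the raw word bytes, then that many float32 values. A minimal Python sketch for reading such a file, assuming native little-endian byte order and UTF-8 word bytes (both assumptions, not stated in the diff):

    # Sketch only: mirrors the per-entry layout written by write_binary_vectors() above.
    # Assumes little-endian int32/float32 and UTF-8 encoded words.
    import struct

    def read_binary_vectors(loc):
        vectors = {}
        with open(loc, 'rb') as f:
            while True:
                header = f.read(8)
                if len(header) < 8:
                    break
                word_len, vec_len = struct.unpack('<ii', header)
                word = f.read(word_len).decode('utf8')
                values = struct.unpack('<%df' % vec_len, f.read(4 * vec_len))
                vectors[word] = values
        return vectors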