commit 2e6a60eaec
Merge branch 'master' of https://github.com/honnibal/spaCy
.travis.yml (17 lines changed)
@@ -11,11 +11,18 @@ python:
 # install dependencies
 install:
 - "pip install --upgrade setuptools"
-- "rm -rf spacy/"
-- "pip install spacy"
+- "pip install cython fabric fabtools"
+- "pip install -r requirements.txt"
+- "python setup.py build_ext --inplace"
+- "mkdir -p corpora/en"
+- "cd corpora/en"
+- "wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz"
+- "tar -xzf WordNet-3.0.tar.gz"
+- "mv WordNet-3.0 wordnet"
+- "cd ../../"
+- "export PYTHONPATH=`pwd`"
+- "python bin/init_model.py lang_data/en corpora/en spacy/en/data"
 
 # run tests
 script:
-- py.test tests/tokenizer/
-- py.test tests/vocab/
-- py.test tests/tagger/
+- "py.test tests/ -x"
bin/gather_freqs.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+import plac
+
+def main(in_loc, out_loc):
+    out_file = open(out_loc, 'w')
+    this_key = None
+    this_freq = 0
+    df = 0
+    for line in open(in_loc):
+        line = line.strip()
+        if not line:
+            continue
+        freq, key = line.split('\t', 1)
+        freq = int(freq)
+        if this_key is not None and key != this_key:
+            out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
+            this_key = key
+            this_freq = freq
+            df = 1
+        else:
+            this_freq += freq
+            df += 1
+    out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
+    out_file.close()
+
+
+if __name__ == '__main__':
+    plac.call(main)
@@ -15,6 +15,8 @@ Requires:
 * clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
 * vectors.tgz --- output of something like word2vec
 """
+from __future__ import unicode_literals
+
 import plac
 from pathlib import Path
 
@@ -45,7 +47,7 @@ def setup_tokenizer(lang_data_dir, tok_dir):
 
 def _read_clusters(loc):
     if not loc.exists():
-        print "Warning: Clusters file not found"
+        print("Warning: Clusters file not found")
         return {}
     clusters = {}
     for line in codecs.open(str(loc), 'r', 'utf8'):
@@ -60,7 +62,7 @@ def _read_clusters(loc):
         else:
             clusters[word] = '0'
     # Expand clusters with re-casing
-    for word, cluster in clusters.items():
+    for word, cluster in list(clusters.items()):
         if word.lower() not in clusters:
             clusters[word.lower()] = cluster
         if word.title() not in clusters:
@@ -72,7 +74,7 @@ def _read_clusters(loc):
 
 def _read_probs(loc):
     if not loc.exists():
-        print "Warning: Probabilities file not found"
+        print("Warning: Probabilities file not found")
         return {}
     probs = {}
     for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
@@ -85,7 +87,7 @@ def _read_probs(loc):
 def _read_senses(loc):
     lexicon = defaultdict(lambda: defaultdict(list))
     if not loc.exists():
-        print "Warning: WordNet senses not found"
+        print("Warning: WordNet senses not found")
         return lexicon
     sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
     pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
@@ -109,13 +111,20 @@ def setup_vocab(src_dir, dst_dir):
     if vectors_src.exists():
         write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
     else:
-        print "Warning: Word vectors file not found"
+        print("Warning: Word vectors file not found")
     vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
     clusters = _read_clusters(src_dir / 'clusters.txt')
     probs = _read_probs(src_dir / 'words.sgt.prob')
-    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
+    if not probs:
+        min_prob = 0.0
+    else:
+        min_prob = min(probs.values())
+    for word in clusters:
+        if word not in probs:
+            probs[word] = min_prob
+
     lexicon = []
-    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
+    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
         entry = get_lex_props(word)
         if word in clusters or float(prob) >= -17:
             entry['prob'] = float(prob)
@@ -144,7 +153,7 @@ def main(lang_data_dir, corpora_dir, model_dir):
     setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
     setup_vocab(corpora_dir, model_dir / 'vocab')
     if not (model_dir / 'wordnet').exists():
-        copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))
+        copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))
 
 
 if __name__ == '__main__':
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 from __future__ import division
 from __future__ import unicode_literals
+from __future__ import print_function
 
 import os
 from os import path
@@ -107,7 +108,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
 
     nlp = Language(data_dir=model_dir)
 
-    print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %"
+    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
     for itn in range(n_iter):
         scorer = Scorer()
         loss = 0
@@ -138,9 +139,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
             nlp.entity.train(tokens, gold)
             nlp.tagger.train(tokens, gold.tags)
         random.shuffle(gold_tuples)
-        print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
+        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                    scorer.tags_acc,
-                                                   scorer.token_acc)
+                                                   scorer.token_acc))
     nlp.end_training()
 
 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
@@ -219,14 +220,14 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos
     # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
     scorer = evaluate(English, list(read_json_file(dev_loc)),
                       model_dir, gold_preproc=gold_preproc, verbose=verbose)
-    print 'TOK', scorer.token_acc
-    print 'POS', scorer.tags_acc
-    print 'UAS', scorer.uas
-    print 'LAS', scorer.las
+    print('TOK', scorer.token_acc)
+    print('POS', scorer.tags_acc)
+    print('UAS', scorer.uas)
+    print('LAS', scorer.las)
 
-    print 'NER P', scorer.ents_p
-    print 'NER R', scorer.ents_r
-    print 'NER F', scorer.ents_f
+    print('NER P', scorer.ents_p)
+    print('NER R', scorer.ents_r)
+    print('NER F', scorer.ents_f)
 
 
 if __name__ == '__main__':
corpora/en/clusters.txt (new file, 316709 lines)
File diff suppressed because it is too large
@@ -2,7 +2,7 @@ cython
 cymem == 1.11
 pathlib
 preshed == 0.37
-thinc == 3.2
+thinc == 3.3
 murmurhash == 0.24
 unidecode
 numpy
setup.py (3 lines changed)
@@ -120,7 +120,7 @@ def run_setup(exts):
         ext_modules=exts,
         license="Dual: Commercial or AGPL",
         install_requires=['numpy', 'murmurhash', 'cymem >= 1.11', 'preshed == 0.37',
-                          'thinc == 3.2', "unidecode", 'wget', 'plac', 'six',
+                          'thinc == 3.3', "unidecode", 'wget', 'plac', 'six',
                           'ujson'],
         setup_requires=["headers_workaround"],
     )
@@ -162,6 +162,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
              'spacy.gold', 'spacy.orth',
              'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
              'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
+             'spacy.cfile',
              'spacy.syntax.ner']
 
 
spacy/cfile.pxd (new file, 12 lines)
@@ -0,0 +1,12 @@
+from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from cymem.cymem cimport Pool
+
+cdef class CFile:
+    cdef FILE* fp
+    cdef bint is_open
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
spacy/cfile.pyx (new file, 40 lines)
@@ -0,0 +1,40 @@
+from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+
+
+cdef class CFile:
+    def __init__(self, loc, mode):
+        if isinstance(mode, unicode):
+            mode_str = mode.encode('ascii')
+        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+        self.fp = fopen(<char*>bytes_loc, mode_str)
+        if self.fp == NULL:
+            raise IOError("Could not open binary file %s" % bytes_loc)
+        self.is_open = True
+
+    def __dealloc__(self):
+        if self.is_open:
+            fclose(self.fp)
+
+    def close(self):
+        fclose(self.fp)
+        self.is_open = False
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
+        st = fread(dest, elem_size, number, self.fp)
+        if st != number:
+            raise IOError
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
+        st = fwrite(src, elem_size, number, self.fp)
+        if st != number:
+            raise IOError
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
+        cdef void* dest = mem.alloc(number, elem_size)
+        self.read_into(dest, number, elem_size)
+        return dest
+
+    def write_unicode(self, unicode value):
+        cdef bytes py_bytes = value.encode('utf8')
+        cdef char* chars = <char*>py_bytes
+        self.write(sizeof(char), len(py_bytes), chars)
@@ -95,15 +95,15 @@ class English(object):
 
         self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))
 
-        if Tagger:
+        if Tagger and path.exists(path.join(data_dir, 'pos')):
             self.tagger = Tagger(self.vocab.strings, data_dir)
         else:
             self.tagger = None
-        if Parser:
+        if Parser and path.exists(path.join(data_dir, 'deps')):
             self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps'))
         else:
             self.parser = None
-        if Entity:
+        if Entity and path.exists(path.join(data_dir, 'ner')):
             self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
         else:
             self.entity = None
@@ -153,15 +153,14 @@ class English(object):
         self.tagger.model.end_training()
         self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
 
-        packer = Packer(self.vocab, [
-            (TAG, self.tagger.moves.freqs[TAG].items()),
-            (HEAD, self.parser.moves.freqs[HEAD].items()),
-            (DEP, self.parser.moves.freqs[DEP].items()),
-            (ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()),
-            (ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items())
-        ])
-        packer.dump(path.join(data_dir, 'vocab'))
+        with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
+            file_.write(
+                json.dumps([
+                    (TAG, self.tagger.freqs[TAG].items()),
+                    (DEP, self.parser.moves.freqs[DEP].items()),
+                    (ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()),
+                    (ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items()),
+                    (HEAD, self.parser.moves.freqs[HEAD].items())]))
 
     @property
     def tags(self):
@@ -14,6 +14,9 @@ from ..attrs cimport LEMMA as _LEMMA
 from ..attrs cimport POS as _POS
 from ..attrs cimport TAG as _TAG
 from ..attrs cimport DEP as _DEP
+from ..attrs cimport HEAD as _HEAD
+from ..attrs cimport ENT_IOB as _ENT_IOB
+from ..attrs cimport ENT_TYPE as _ENT_TYPE
 
 
 cpdef enum:
@@ -262,6 +262,9 @@ cdef class EnPosTagger:
                                             'morphs.json'))))
         self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
         self.freqs = {TAG: defaultdict(int)}
+        for tag in self.tag_names:
+            self.freqs[TAG][self.strings[tag]] = 1
+        self.freqs[TAG][0] = 1
 
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -3,8 +3,6 @@ from cymem.cymem cimport Pool
 from .structs cimport TokenC
 from .syntax.transition_system cimport Transition
 
-cimport numpy
-
 
 cdef struct GoldParseC:
     int* tags
@@ -1,7 +1,5 @@
 import numpy
 import codecs
-import json
-import ujson
 import random
 import re
 import os
@@ -9,6 +7,11 @@ from os import path
 
 from libc.string cimport memset
 
+try:
+    import ujson as json
+except ImportError:
+    import json
+
 
 def tags_to_entities(tags):
     entities = []
@@ -128,7 +131,7 @@ def read_json_file(loc, docs_filter=None):
             yield from read_json_file(path.join(loc, filename))
     else:
         with open(loc) as file_:
-            docs = ujson.load(file_)
+            docs = json.load(file_)
         for doc in docs:
             if docs_filter is not None and not docs_filter(doc):
                 continue
@@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil
 
 
 cdef class BitArray:
-    cdef bytes data
+    cdef bytearray data
     cdef uchar byte
     cdef uchar bit_of_byte
     cdef uint32_t i
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 from libc.string cimport memcpy
 
 # Note that we're setting the most significant bits here first, when in practice
@@ -15,7 +17,7 @@ cdef Code bit_append(Code code, bint bit) nogil:
 
 cdef class BitArray:
     def __init__(self, data=b''):
-        self.data = data
+        self.data = bytearray(data)
         self.byte = 0
         self.bit_of_byte = 0
         self.i = 0
@@ -45,7 +47,7 @@ cdef class BitArray:
         start_bit = self.i % 8
 
         if start_bit != 0 and start_byte < len(self.data):
-            byte = ord(self.data[start_byte])
+            byte = self.data[start_byte]
             for i in range(start_bit, 8):
                 self.i += 1
                 yield 1 if (byte & (one << i)) else 0
@@ -68,18 +70,24 @@ cdef class BitArray:
 
         # TODO portability
         cdef uchar[4] chars
-        chars[0] = <uchar>ord(self.data[start_byte])
-        chars[1] = <uchar>ord(self.data[start_byte+1])
-        chars[2] = <uchar>ord(self.data[start_byte+2])
-        chars[3] = <uchar>ord(self.data[start_byte+3])
+        chars[0] = self.data[start_byte]
+        chars[1] = self.data[start_byte+1]
+        chars[2] = self.data[start_byte+2]
+        chars[3] = self.data[start_byte+3]
         cdef uint32_t output
         memcpy(&output, chars, 4)
         self.i += 32
         return output
 
     def as_bytes(self):
+        cdef unsigned char byte_char
         if self.bit_of_byte != 0:
-            return self.data + chr(self.byte)
+            byte = chr(self.byte)
+            # Jump through some hoops for Python3
+            if isinstance(byte, unicode):
+                return self.data + <bytes>(&self.byte)[:1]
+            else:
+                return self.data + chr(self.byte)
         else:
             return self.data
 
@@ -92,7 +100,7 @@ cdef class BitArray:
         self.bit_of_byte += 1
         self.i += 1
         if self.bit_of_byte == 8:
-            self.data += chr(self.byte)
+            self.data += bytearray((self.byte,))
             self.byte = 0
             self.bit_of_byte = 0
 
@@ -106,7 +114,7 @@ cdef class BitArray:
                 self.byte &= ~(one << self.bit_of_byte)
             self.bit_of_byte += 1
             if self.bit_of_byte == 8:
-                self.data += chr(self.byte)
+                self.data += <bytes>self.byte
                 self.byte = 0
                 self.bit_of_byte = 0
             self.i += 1
@@ -1,4 +1,5 @@
 # cython: profile=True
+from __future__ import unicode_literals
 cimport cython
 from libcpp.queue cimport priority_queue
 from libcpp.pair cimport pair
@@ -110,14 +111,14 @@ cdef class HuffmanCodec:
         cdef int branch
 
         cdef int n_msg = msg.shape[0]
-        cdef bytes bytes_ = bits.as_bytes()
+        cdef bytearray bytes_ = bits.as_bytes()
         cdef unsigned char byte
         cdef int i_msg = 0
         cdef int i_byte = bits.i // 8
         cdef unsigned char i_bit = 0
         cdef unsigned char one = 1
         while i_msg < n_msg:
-            byte = ord(bytes_[i_byte])
+            byte = bytes_[i_byte]
             i_byte += 1
             for i_bit in range(8):
                 branch = node.right if (byte & (one << i_bit)) else node.left
@@ -138,11 +139,11 @@ cdef class HuffmanCodec:
         def __get__(self):
             output = []
             cdef int i, j
-            cdef bytes string
+            cdef unicode string
            cdef Code code
             for i in range(self.codes.size()):
                 code = self.codes[i]
-                string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
+                string = '{0:b}'.format(code.bits).rjust(code.length, '0')
                 string = string[::-1]
                 output.append(string)
             return output
@@ -10,6 +10,7 @@ from libcpp.pair cimport pair
 from cymem.cymem cimport Address, Pool
 from preshed.maps cimport PreshMap
 from preshed.counter cimport PreshCounter
+import json
 
 from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 from ..tokens.doc cimport Doc
@@ -65,7 +66,7 @@ def _gen_orths(Vocab vocab):
 def _gen_chars(Vocab vocab):
     cdef attr_t orth
     cdef size_t addr
-    char_weights = {chr(i): 1e-20 for i in range(256)}
+    char_weights = {i: 1e-20 for i in range(256)}
     cdef unicode string
     cdef bytes char
     cdef bytes utf8_str
@@ -74,9 +75,9 @@ def _gen_chars(Vocab vocab):
         string = vocab.strings[lex.orth]
         utf8_str = string.encode('utf8')
         for char in utf8_str:
-            char_weights.setdefault(char, 0.0)
-            char_weights[char] += c_exp(lex.prob)
-        char_weights[b' '] += c_exp(lex.prob)
+            char_weights.setdefault(ord(char), 0.0)
+            char_weights[ord(char)] += c_exp(lex.prob)
+        char_weights[ord(' ')] += c_exp(lex.prob)
     return char_weights.items()
 
 
@@ -98,33 +99,34 @@ cdef class Packer:
         self._codecs = tuple(codecs)
         self.attrs = tuple(attrs)
 
-    @classmethod
-    def from_dir(cls, Vocab vocab, data_dir):
-        return cls(vocab, util.read_encoding_freqs(data_dir))
-
     def pack(self, Doc doc):
         bits = self._orth_encode(doc)
         if bits is None:
             bits = self._char_encode(doc)
 
         cdef int i
         if self.attrs:
             array = doc.to_array(self.attrs)
             for i, codec in enumerate(self._codecs):
-                codec.encode_int32(array[:, i], bits)
-        return bits
+                codec.encode(array[:, i], bits)
+        return bits.as_bytes()
 
-    def unpack(self, BitArray bits):
+    def unpack(self, data):
+        doc = Doc(self.vocab)
+        self.unpack_into(data, doc)
+        return doc
+
+    def unpack_into(self, byte_string, Doc doc):
+        bits = BitArray(byte_string)
         bits.seek(0)
         cdef int32_t length = bits.read32()
         if length >= 0:
-            doc = self._orth_decode(bits, length)
+            self._orth_decode(bits, length, doc)
         else:
-            doc = self._char_decode(bits, -length)
+            self._char_decode(bits, -length, doc)
 
         array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32)
         for i, codec in enumerate(self._codecs):
-            codec.decode_int32(bits, array[:, i])
+            codec.decode(bits, array[:, i])
 
         doc.from_array(self.attrs, array)
         return doc
@@ -141,20 +143,13 @@ cdef class Packer:
             bits.append(bool(token.whitespace_))
         return bits
 
-    def _orth_decode(self, BitArray bits, n):
-        orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
-        self.orth_codec.decode_int32(bits, orths)
-        orths_and_spaces = zip(orths, bits)
-        cdef Doc doc = Doc(self.vocab, orths_and_spaces)
-        return doc
-
     def _char_encode(self, Doc doc):
         cdef bytes utf8_str = doc.string.encode('utf8')
         cdef BitArray bits = BitArray()
         cdef int32_t length = len(utf8_str)
         # Signal chars with negative length
         bits.extend(-length, 32)
-        self.char_codec.encode(utf8_str, bits)
+        self.char_codec.encode(bytearray(utf8_str), bits)
         cdef int i, j
         for i in range(doc.length):
             for j in range(doc.data[i].lex.length-1):
@@ -164,12 +159,24 @@ cdef class Packer:
             bits.append(False)
         return bits
 
-    def _char_decode(self, BitArray bits, n):
+    def _orth_decode(self, BitArray bits, int32_t n, Doc doc):
+        cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
+        self.orth_codec.decode_int32(bits, orths)
+        cdef int i
+        cdef bint space
+        spaces = iter(bits)
+        for i in range(n):
+            orth = orths[i]
+            space = next(spaces)
+            lex = self.vocab.get_by_orth(doc.mem, orth)
+            doc.push_back(lex, space)
+        return doc
+
+    def _char_decode(self, BitArray bits, int32_t n, Doc doc):
         cdef bytearray utf8_str = bytearray(n)
         self.char_codec.decode(bits, utf8_str)
 
         cdef unicode string = utf8_str.decode('utf8')
-        cdef Doc tokens = Doc(self.vocab)
         cdef int start = 0
         cdef bint is_spacy
         cdef int length = len(string)
@@ -178,11 +185,11 @@ cdef class Packer:
         for is_end_token in bits:
             if is_end_token:
                 span = string[start:i+1]
-                lex = self.vocab.get(tokens.mem, span)
+                lex = self.vocab.get(doc.mem, span)
                 is_spacy = (i+1) < length and string[i+1] == u' '
-                tokens.push_back(lex, is_spacy)
+                doc.push_back(lex, is_spacy)
                 start = i + 1 + is_spacy
             i += 1
             if i >= n:
                 break
-        return tokens
+        return doc
@@ -81,6 +81,7 @@ cdef class StringStore:
     def __getitem__(self, object string_or_id):
         cdef bytes byte_string
         cdef const Utf8Str* utf8str
+        cdef int id_
         if isinstance(string_or_id, int) or isinstance(string_or_id, long):
             if string_or_id == 0:
                 return u''
@@ -1,4 +1,3 @@
-# cython: profile=True
 """
 Fill an array, context, with every _atomic_ value our features reference.
 We then write the _actual features_ as tuples of the atoms. The machinery
@@ -1,4 +1,3 @@
-# cython: profile=True
 from __future__ import unicode_literals
 
 import ctypes
@@ -85,6 +85,9 @@ cdef class BiluoPushDown(TransitionSystem):
             elif gold.c.ner[i].move == OUT:
                 self.freqs[ENT_IOB][1] += 1
                 self.freqs[ENT_TYPE][0] += 1
+            else:
+                self.freqs[ENT_IOB][1] += 1
+                self.freqs[ENT_TYPE][0] += 1
 
     cdef Transition lookup_transition(self, object name) except *:
         if name == '-':
@@ -1,4 +1,3 @@
-# cython: profile=True
 """
 MALT-style dependency parser
 """
@@ -85,18 +84,17 @@ cdef class Parser:
 
         cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
                                   self.model.n_feats, self.model.n_feats)
-        self.parse(stcls, eg.c)
+        with nogil:
+            self.parse(stcls, eg.c)
         tokens.set_parse(stcls._sent)
 
     cdef void parse(self, StateClass stcls, ExampleC eg) nogil:
         while not stcls.is_final():
             memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
 
             self.moves.set_valid(eg.is_valid, stcls)
             fill_context(eg.atoms, stcls)
             self.model.set_scores(eg.scores, eg.atoms)
             eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes)
 
             self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
         self.moves.finalize_state(stcls)
 
@@ -1,4 +1,3 @@
-# cython: profile=True
 from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t
 from ..vocab cimport EMPTY_LEXEME
@@ -33,6 +33,11 @@ cdef class TransitionSystem:
         self.freqs = {}
         for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
             self.freqs[attr] = defaultdict(int)
+            self.freqs[attr][0] = 1
+        # Ensure we've seen heads. Need an official dependency length limit...
+        for i in range(512):
+            self.freqs[HEAD][i] = 1
+            self.freqs[HEAD][-i] = 1
 
     cdef int initialize_state(self, StateClass state) except -1:
         pass
@@ -71,17 +71,6 @@ cdef class Doc:
         self.is_tagged = False
         self.is_parsed = False
         self._py_tokens = []
-        cdef const LexemeC* lex
-        cdef attr_t orth
-        cdef bint space
-        if orths_and_spaces is not None:
-            for orth, space in orths_and_spaces:
-                lex = <LexemeC*>self.vocab._by_orth.get(orth)
-                if lex != NULL:
-                    assert lex.orth == orth
-                    self.push_back(lex, space)
-                else:
-                    raise Exception('Lexeme not found: %d' % orth)
 
     def __getitem__(self, object i):
         """Get a token.
@@ -122,9 +111,12 @@ cdef class Doc:
     def __unicode__(self):
         return u''.join([t.string for t in self])
 
+    def __str__(self):
+        return u''.join([t.string for t in self])
+
     @property
     def string(self):
-        return unicode(self)
+        return u''.join([t.string for t in self])
 
     @property
     def ents(self):
@@ -303,12 +295,11 @@ cdef class Doc:
         return self
 
     def to_bytes(self):
-        bits = self.vocab.packer.pack(self)
-        return struct.pack('I', len(bits)) + bits.as_bytes()
+        byte_string = self.vocab.serializer.pack(self)
+        return struct.pack('I', len(byte_string)) + byte_string
 
     def from_bytes(self, data):
-        bits = BitArray(data)
-        self.vocab.packer.unpack_into(bits, self)
+        self.vocab.serializer.unpack_into(data[4:], self)
         return self
 
     @staticmethod
@@ -316,15 +307,14 @@ cdef class Doc:
         keep_reading = True
         while keep_reading:
             try:
-                n_bits_str = file_.read(4)
-                if len(n_bits_str) < 4:
+                n_bytes_str = file_.read(4)
+                if len(n_bytes_str) < 4:
                     break
-                n_bits = struct.unpack('I', n_bits_str)[0]
-                n_bytes = n_bits // 8 + bool(n_bits % 8)
+                n_bytes = struct.unpack('I', n_bytes_str)[0]
                 data = file_.read(n_bytes)
             except StopIteration:
                 keep_reading = False
-            yield data
+            yield n_bytes_str + data
 
     # This function is terrible --- need to fix this.
     def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
@@ -34,6 +34,9 @@ cdef class Token:
     def __unicode__(self):
         return self.string
 
+    def __str__(self):
+        return self.string
+
     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
         return check_flag(self.c.lex, flag_id)
 
@@ -65,16 +65,6 @@ def read_tokenization(lang):
     return entries
 
 
-def read_encoding_freqs(data_dir):
-    tags = json.load(open(path.join(data_dir, '..', 'pos', 'tag_freqs.json')))
-    heads = json.load(open(path.join(data_dir, '..', 'deps', 'head_freqs.json')))
-    deps = json.load(open(path.join(data_dir, '..', 'deps', 'dep_freqs.json')))
-    iob = json.load(open(path.join(data_dir, '..', 'ner', 'iob_freqs.json')))
-    ne_types = json.load(open(path.join(data_dir, '..', 'ner', 'ne_freqs.json')))
-    return [(TAG, tags), (HEAD, heads), (DEP, deps), (ENT_IOB, iob),
-            (ENT_TYPE, ne_types)]
-
-
 def read_detoken_rules(lang): # Deprecated?
     loc = path.join(DATA_DIR, lang, 'detokenize')
     entries = []
@@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 
 from .structs cimport LexemeC, TokenC
-from .typedefs cimport utf8_t, hash_t
+from .typedefs cimport utf8_t, attr_t, hash_t
 from .strings cimport StringStore
 
 
@@ -29,9 +29,12 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cdef readonly object pos_tags
    cdef readonly int length
-    cdef public object packer
+    cdef public object _serializer
+    cdef public object data_dir
+
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
+    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
 
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
 
     cdef PreshMap _by_hash
spacy/vocab.pyx (108 lines changed)
@@ -1,3 +1,6 @@
+from __future__ import unicode_literals
+
+
 from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from libc.string cimport memset
 from libc.stdint cimport int32_t
@@ -6,6 +9,7 @@ import bz2
 from os import path
 import codecs
 import math
+import json
 
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport set_lex_struct_props
@@ -13,6 +17,7 @@ from .lexeme cimport Lexeme
 from .strings cimport hash_string
 from .orth cimport word_shape
 from .typedefs cimport attr_t
+from .cfile cimport CFile
 
 from cymem.cymem cimport Address
 from . import util
@@ -54,8 +59,19 @@ cdef class Vocab:
         if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
             self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
 
-        #self.packer = Packer(self, util.read_encoding_freqs(data_dir))
-        self.packer = None
+        self._serializer = None
+        self.data_dir = data_dir
 
+    property serializer:
+        def __get__(self):
+            if self._serializer is None:
+                freqs = []
+                if self.data_dir is not None:
+                    freqs_loc = path.join(self.data_dir, 'serializer.json')
+                    if path.exists(freqs_loc):
+                        freqs = json.load(open(freqs_loc))
+                self._serializer = Packer(self, freqs)
+            return self._serializer
+
     def __len__(self):
         """The current number of lexemes stored."""
@@ -82,6 +98,27 @@ cdef class Vocab:
             self._add_lex_to_vocab(key, lex)
         return lex
 
+    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
+        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
+        if necessary, using memory acquired from the given pool. If the pool
+        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
+        cdef LexemeC* lex
+        lex = <LexemeC*>self._by_orth.get(orth)
+        if lex != NULL:
+            return lex
+        cdef unicode string = self.strings[orth]
+        cdef bint is_oov = mem is not self.mem
+        if len(string) < 3:
+            mem = self.mem
+        lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
+        props = self.lexeme_props_getter(string)
+        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
+        if is_oov:
+            lex.id = 0
+        else:
+            self._add_lex_to_vocab(hash_string(string), lex)
+        return lex
+
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
         self._by_hash.set(key, <void*>lex)
         self._by_orth.set(lex.orth, <void*>lex)
@@ -138,19 +175,16 @@ cdef class Vocab:
         if path.exists(loc):
             assert not path.isdir(loc)
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
-        cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
-        assert fp != NULL
+        cdef CFile fp = CFile(bytes_loc, 'wb')
         cdef size_t st
         cdef size_t addr
         cdef hash_t key
         for key, addr in self._by_hash.items():
             lexeme = <LexemeC*>addr
-            st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp)
-            assert st == 1
-            st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
-            assert st == 1
-        st = fclose(fp)
-        assert st == 0
+            fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
+            fp.write_from(lexeme, sizeof(LexemeC), 1)
+        fp.close()
 
     def load_lexemes(self, strings_loc, loc):
         self.strings.load(strings_loc)
@@ -188,7 +222,7 @@ cdef class Vocab:
         fclose(fp)
 
     def load_rep_vectors(self, loc):
-        file_ = _CFile(loc, b'rb')
+        cdef CFile file_ = CFile(loc, b'rb')
        cdef int32_t word_len
         cdef int32_t vec_len
         cdef int32_t prev_vec_len = 0
@@ -198,22 +232,20 @@ cdef class Vocab:
         cdef bytes py_word
         cdef vector[float*] vectors
         cdef int i
+        cdef Pool tmp_mem = Pool()
         while True:
             try:
-                file_.read(&word_len, sizeof(word_len), 1)
+                file_.read_into(&word_len, sizeof(word_len), 1)
             except IOError:
                 break
-            file_.read(&vec_len, sizeof(vec_len), 1)
+            file_.read_into(&vec_len, sizeof(vec_len), 1)
             if prev_vec_len != 0 and vec_len != prev_vec_len:
                 raise VectorReadError.mismatched_sizes(loc, vec_len, prev_vec_len)
             if 0 >= vec_len >= MAX_VEC_SIZE:
                 raise VectorReadError.bad_size(loc, vec_len)
-            mem = Address(word_len, sizeof(char))
-            chars = <char*>mem.ptr
-            vec = <float*>self.mem.alloc(vec_len, sizeof(float))
 
-            file_.read(chars, sizeof(char), word_len)
-            file_.read(vec, sizeof(float), vec_len)
+            chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
+            vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
 
             string_id = self.strings[chars[:word_len]]
             while string_id >= vectors.size():
@@ -235,7 +267,7 @@ cdef class Vocab:
 
 
 def write_binary_vectors(in_loc, out_loc):
-    cdef _CFile out_file = _CFile(out_loc, 'wb')
+    cdef CFile out_file = CFile(out_loc, 'wb')
     cdef Address mem
     cdef int32_t word_len
     cdef int32_t vec_len
@@ -252,42 +284,12 @@ def write_binary_vectors(in_loc, out_loc):
         word_len = len(word)
         vec_len = len(pieces)
 
-        out_file.write(sizeof(word_len), 1, &word_len)
-        out_file.write(sizeof(vec_len), 1, &vec_len)
+        out_file.write_from(&word_len, 1, sizeof(word_len))
+        out_file.write_from(&vec_len, 1, sizeof(vec_len))
 
         chars = <char*>word
-        out_file.write(sizeof(char), len(word), chars)
-        out_file.write(sizeof(float), vec_len, vec)
-
-
-cdef class _CFile:
-    cdef FILE* fp
-    def __init__(self, loc, bytes mode):
-        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
-        self.fp = fopen(<char*>bytes_loc, mode)
-        if self.fp == NULL:
-            raise IOError
-
-    def __dealloc__(self):
-        fclose(self.fp)
-
-    def close(self):
-        fclose(self.fp)
-
-    cdef int read(self, void* dest, size_t elem_size, size_t n) except -1:
-        st = fread(dest, elem_size, n, self.fp)
-        if st != n:
-            raise IOError
-
-    cdef int write(self, size_t elem_size, size_t n, void* data) except -1:
-        st = fwrite(data, elem_size, n, self.fp)
-        if st != n:
-            raise IOError
-
-    cdef int write_unicode(self, unicode value):
-        cdef bytes py_bytes = value.encode('utf8')
-        cdef char* chars = <char*>py_bytes
-        self.write(sizeof(char), len(py_bytes), chars)
+        out_file.write_from(chars, len(word), sizeof(char))
+        out_file.write_from(vec, vec_len, sizeof(float))
 
 
 class VectorReadError(Exception):
@@ -7,3 +7,19 @@ import os
 def EN():
     data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
     return English(data_dir=data_dir)
+
+
+def pytest_addoption(parser):
+    parser.addoption("--models", action="store_true",
+        help="include tests that require full models")
+    parser.addoption("--vectors", action="store_true",
+        help="include word vectors tests")
+    parser.addoption("--slow", action="store_true",
+        help="include slow tests")
+
+
+def pytest_runtest_setup(item):
+    for opt in ['models', 'vectors', 'slow']:
+        if opt in item.keywords and not item.config.getoption("--%s" % opt):
+            pytest.skip("need --%s option to run" % opt)
@@ -1,4 +1,6 @@
+import pytest
 
+@pytest.mark.models
 def test_simple_types(EN):
     tokens = EN(u'Mr. Best flew to New York on Saturday morning.')
     ents = list(tokens.ents)
@@ -1,6 +1,7 @@
 import pytest
 
 
+@pytest.mark.models
 def test_root(EN):
     tokens = EN(u"i don't have other assistance")
     for t in tokens:
@@ -12,6 +12,7 @@ def sun_text():
     return text
 
 
+@pytest.mark.models
 def test_consistency(EN, sun_text):
     tokens = EN(sun_text)
     for head in tokens:
@@ -21,6 +22,7 @@ def test_consistency(EN, sun_text):
         assert child.head is head
 
 
+@pytest.mark.models
 def test_child_consistency(EN, sun_text):
     tokens = EN(sun_text)
 
@@ -53,6 +55,7 @@ def test_child_consistency(EN, sun_text):
     assert not children
 
 
+@pytest.mark.models
 def test_edges(EN):
     sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."
     tokens = EN(sun_text)
@@ -1,6 +1,8 @@
 from __future__ import unicode_literals
+import pytest
 
 
+@pytest.mark.models
 def test_subtrees(EN):
     sent = EN('The four wheels on the bus turned quickly')
     wheels = sent[2]
@@ -45,7 +45,7 @@ def test1():
     codec = HuffmanCodec(list(enumerate(probs)))
 
     py_codes = py_encode(dict(enumerate(probs)))
-    py_codes = py_codes.items()
+    py_codes = list(py_codes.items())
     py_codes.sort()
     assert codec.strings == [c for i, c in py_codes]
 
@@ -60,7 +60,7 @@ def test_round_trip():
     strings = list(codec.strings)
     codes = {codec.leaves[i]: strings[i] for i in range(len(codec.leaves))}
     bits = codec.encode(message)
-    string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in bits.as_bytes())
+    string = ''.join('{0:b}'.format(c).rjust(8, '0')[::-1] for c in bits.as_bytes())
     for word in message:
         code = codes[word]
         assert string[:len(code)] == code
@@ -76,7 +76,7 @@ def test_rosetta():
     symb2freq = defaultdict(int)
     for ch in txt:
         symb2freq[ch] += 1
-    by_freq = symb2freq.items()
+    by_freq = list(symb2freq.items())
     by_freq.sort(reverse=True, key=lambda item: item[1])
     symbols = [sym for sym, prob in by_freq]
 
@@ -96,6 +96,7 @@ def test_rosetta():
     assert my_exp_len == py_exp_len
 
 
+@pytest.mark.slow
 def test_vocab(EN):
     codec = HuffmanCodec([(w.orth, numpy.exp(w.prob)) for w in EN.vocab])
     expected_length = 0
@@ -105,6 +106,7 @@ def test_vocab(EN):
     assert 8 < expected_length < 15
 
 
+@pytest.mark.slow
 def test_freqs():
     freqs = []
     words = []
23
tests/serialize/test_io.py
Normal file
@@ -0,0 +1,23 @@
+import pytest
+
+from spacy.serialize.packer import Packer
+from spacy.attrs import ORTH, SPACY
+from spacy.tokens import Doc
+import math
+
+
+def test_read_write(EN):
+    doc1 = EN(u'This is a simple test. With a couple of sentences.')
+    doc2 = EN(u'This is another test document.')
+
+    with open('/tmp/spacy_docs.bin', 'wb') as file_:
+        file_.write(doc1.to_bytes())
+        file_.write(doc2.to_bytes())
+
+    with open('/tmp/spacy_docs.bin', 'rb') as file_:
+        bytes1, bytes2 = Doc.read_bytes(file_)
+        r1 = Doc(EN.vocab).from_bytes(bytes1)
+        r2 = Doc(EN.vocab).from_bytes(bytes2)
+
+    assert r1.string == doc1.string
+    assert r2.string == doc2.string
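The new test exercises the binary serialization round trip. A hedged sketch of the same pattern for an arbitrary number of documents, assuming (as `test_read_write` implies) that `Doc.read_bytes()` yields one byte string per stored `Doc`; the path and texts here are illustrative only:

    from spacy.en import English
    from spacy.tokens import Doc

    nlp = English()
    texts = [u'First document.', u'Second document.']
    with open('/tmp/spacy_docs_many.bin', 'wb') as file_:
        for text in texts:
            file_.write(nlp(text).to_bytes())    # append each Doc's byte string
    with open('/tmp/spacy_docs_many.bin', 'rb') as file_:
        docs = [Doc(nlp.vocab).from_bytes(b) for b in Doc.read_bytes(file_)]
    assert [d.string for d in docs] == [nlp(t).string for t in texts]
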
@@ -56,12 +56,12 @@ def test_char_packer(vocab):
     bits = BitArray()
     bits.seek(0)
 
-    byte_str = b'the dog jumped'
+    byte_str = bytearray(b'the dog jumped')
     packer.char_codec.encode(byte_str, bits)
     bits.seek(0)
     result = [b''] * len(byte_str)
     packer.char_codec.decode(bits, result)
-    assert b''.join(result) == byte_str
+    assert bytearray(result) == byte_str
 
 
 def test_packer_unannotated(tokenizer):
@@ -120,5 +120,3 @@ def test_packer_annotated(tokenizer):
     assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']
     assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']
     assert [(t.head.i - t.i) for t in result] == [1, 1, 0]
-
-
@@ -1,6 +1,8 @@
 from __future__ import unicode_literals
+import pytest
 
 
+@pytest.mark.models
 def test_merge_tokens(EN):
     tokens = EN(u'Los Angeles start.')
     assert len(tokens) == 4
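This and the following hunks tag model-dependent tests with `@pytest.mark.models` (and `@pytest.mark.slow` / `@pytest.mark.vectors` elsewhere), so they can be deselected when the trained data is not installed, e.g. with `py.test tests/ -m "not models"`. A hypothetical `conftest.py` hook that skips them unless explicitly requested might look like this (illustrative sketch, not part of this commit):

    import pytest

    def pytest_addoption(parser):
        parser.addoption('--models', action='store_true',
                         help='run tests that need the trained English model')

    def pytest_runtest_setup(item):
        # Skip any test carrying the 'models' mark unless --models was passed.
        if 'models' in item.keywords and not item.config.getoption('--models'):
            pytest.skip('need --models option to run this test')
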
@@ -12,6 +14,7 @@ def test_merge_tokens(EN):
     assert tokens[0].head.orth_ == 'start'
 
 
+@pytest.mark.models
 def test_merge_heads(EN):
     tokens = EN(u'I found a pilates class near work.')
     assert len(tokens) == 8
@@ -9,6 +9,7 @@ def doc(EN):
     return EN('This is a sentence. This is another sentence. And a third.')
 
 
+@pytest.mark.models
 def test_sent_spans(doc):
     sents = list(doc.sents)
     assert sents[0].start == 0
@@ -17,6 +18,7 @@ def test_sent_spans(doc):
     assert sum(len(sent) for sent in sents) == len(doc)
 
 
+@pytest.mark.models
 def test_root(doc):
     np = doc[2:4]
     assert len(np) == 2
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import pytest
 
 
+@pytest.mark.models
 def test_am_pm(en_nlp):
     numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
     variants = ['a.m.', 'am', 'p.m.', 'pm']
@@ -14,7 +15,7 @@ def test_am_pm(en_nlp):
     tokens = en_nlp(string, merge_mwes=True)
     assert tokens[4].orth_ == '%s%s%s' % (num, space, var)
     ents = list(tokens.ents)
-    assert len(ents) == 1
+    assert len(ents) == 1, ents
     assert ents[0].label_ == 'TIME', string
     if ents[0].start == 4 and ents[0].end == 5:
         assert ents[0].orth_ == '%s%s%s' % (num, space, var)
@@ -17,6 +17,7 @@ def lemmas(tagged):
     return [t.lemma_ for t in tagged]
 
 
+@pytest.mark.models
 def test_lemmas(lemmas, tagged):
     assert lemmas[0] == 'banana'
     assert lemmas[1] == 'in'
@@ -12,6 +12,7 @@ def morph_exc():
     }
 
 
+@pytest.mark.models
 def test_load_exc(morph_exc):
     # Do this local as we want to modify it
     nlp = English()
@@ -1,7 +1,9 @@
 from spacy.en import English
 import six
+import pytest
 
 
+@pytest.mark.models
 def test_tag_names(EN):
     tokens = EN(u'I ate pizzas with anchovies.', parse=False, tag=True)
     pizza = tokens[2]
@@ -1,8 +1,9 @@
 # -*- coding: utf-8 -*-
 """Sphinx doctest is just too hard. Manually paste doctest examples here"""
 from spacy.en.attrs import IS_LOWER
+import pytest
 
+@pytest.mark.models
 def test_1():
     import spacy.en
     from spacy.parts_of_speech import ADV
@@ -21,6 +22,7 @@ def test_1():
     assert o == -11.07155704498291
 
 
+@pytest.mark.models
 def test2():
     import spacy.en
     from spacy.parts_of_speech import ADV
@@ -41,6 +43,7 @@ def test2():
     -11.07155704498291
 
 
+@pytest.mark.models
 def test3():
     import spacy.en
     from spacy.parts_of_speech import ADV
@@ -15,6 +15,7 @@ def test_attr_of_token(EN):
     assert feats_array[0][0] != feats_array[0][1]
 
 
+@pytest.mark.models
 def test_tag(EN):
     text = u'A nice sentence.'
     tokens = EN(text)
@@ -26,6 +27,7 @@ def test_tag(EN):
     assert feats_array[3][1] == tokens[3].tag
 
 
+@pytest.mark.models
 def test_dep(EN):
     text = u'A nice sentence.'
     tokens = EN(text)
@@ -4,6 +4,7 @@ import pytest
 from spacy.parts_of_speech import ADV
 
 
+@pytest.mark.models
 def test_prob(EN):
     tokens = EN(u'Give it back', parse=False)
     give = tokens[0]
@@ -7,6 +7,7 @@ from spacy.en.attrs import IS_STOP
 import pytest
 
 
+@pytest.mark.models
 def test_strings(EN):
     tokens = EN(u'Give it back! He pleaded.')
     token = tokens[0]
@@ -9,6 +9,7 @@ data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
 # Let this have its own instances, as we have to be careful about memory here
 # that's the point, after all
 
+@pytest.mark.models
 def get_orphan_token(text, i):
     nlp = English(load_vectors=False, data_dir=data_dir)
     tokens = nlp(text)
@@ -18,6 +19,7 @@ def get_orphan_token(text, i):
     return token
 
 
+@pytest.mark.models
 def test_orphan():
     orphan = get_orphan_token('An orphan token', 1)
     gc.collect()
@@ -36,6 +38,7 @@ def _orphan_from_list(toks):
     return lst
 
 
+@pytest.mark.models
 def test_list_orphans():
     # Test case from NSchrading
     nlp = English(load_vectors=False, data_dir=data_dir)
@@ -5,7 +5,7 @@ from spacy.tokens import Doc
 import pytest
 
 
-def test_getitem(EN):
+def mest_getitem(EN):
     tokens = EN(u'Give it back! He pleaded.')
     assert tokens[0].orth_ == 'Give'
     assert tokens[-1].orth_ == '.'
@@ -13,10 +13,19 @@ def test_getitem(EN):
         tokens[len(tokens)]
 
 
-def test_serialize(EN):
-    tokens = EN(u' Give it back! He pleaded. ')
-    packed = tokens.serialize()
-    new_tokens = Doc.deserialize(EN.vocab, packed)
+def mest_serialize(EN):
+    tokens = EN(u'Give it back! He pleaded.')
+    packed = tokens.to_bytes()
+    new_tokens = Doc(EN.vocab).from_bytes(packed)
+    assert tokens.string == new_tokens.string
+    assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
+    assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
+
+
+def test_serialize_whitespace(EN):
+    tokens = EN(u' Give it back! He pleaded. ')
+    packed = tokens.to_bytes()
+    new_tokens = Doc(EN.vocab).from_bytes(packed)
     assert tokens.string == new_tokens.string
     assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
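The hunk above also reflects the API migration from `tokens.serialize()` / `Doc.deserialize()` to `to_bytes()` / `from_bytes()`, and the new `test_serialize_whitespace` checks that surrounding whitespace survives the round trip. A condensed sketch of the new calls (names mirror the tests above; illustrative only):

    doc = EN(u' Give it back! He pleaded. ')
    packed = doc.to_bytes()
    restored = Doc(EN.vocab).from_bytes(packed)
    assert restored.string == doc.string                    # whitespace is preserved
    assert [t.orth_ for t in restored] == [t.orth_ for t in doc]
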
@@ -4,13 +4,14 @@ from spacy.en import English
 
 import pytest
 
+@pytest.mark.vectors
 def test_vec(EN):
     hype = EN.vocab['hype']
     assert hype.orth_ == 'hype'
     assert 0.08 >= hype.repvec[0] > 0.07
 
 
+@pytest.mark.vectors
 def test_capitalized(EN):
     hype = EN.vocab['Hype']
     assert hype.orth_ == 'Hype'
@@ -39,7 +39,7 @@ def test_retrieve_id(sstore):
 
 def test_med_string(sstore):
     nine_char_string = sstore[b'0123456789']
-    assert sstore[nine_char_string] == b'0123456789'
+    assert sstore[nine_char_string] == u'0123456789'
     dummy = sstore[b'A']
     assert sstore[b'0123456789'] == nine_char_string
 