Matthew Honnibal 2015-07-25 21:14:07 +02:00
commit 2e6a60eaec
52 changed files with 317092 additions and 195 deletions


@ -11,11 +11,18 @@ python:
# install dependencies # install dependencies
install: install:
- "pip install --upgrade setuptools" - "pip install --upgrade setuptools"
- "rm -rf spacy/" - "pip install cython fabric fabtools"
- "pip install spacy" - "pip install -r requirements.txt"
- "python setup.py build_ext --inplace"
- "mkdir -p corpora/en"
- "cd corpora/en"
- "wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz"
- "tar -xzf WordNet-3.0.tar.gz"
- "mv WordNet-3.0 wordnet"
- "cd ../../"
- "export PYTHONPATH=`pwd`"
- "python bin/init_model.py lang_data/en corpora/en spacy/en/data"
# run tests # run tests
script: script:
- py.test tests/tokenizer/ - "py.test tests/ -x"
- py.test tests/vocab/
- py.test tests/tagger/

27
bin/gather_freqs.py Normal file

@ -0,0 +1,27 @@
import plac


def main(in_loc, out_loc):
    # Merge a key-sorted stream of "count<TAB>key" lines (one per document)
    # into "total_count<TAB>doc_frequency<TAB>key" lines, one per key.
    out_file = open(out_loc, 'w')
    this_key = None
    this_freq = 0
    df = 0
    for line in open(in_loc):
        line = line.strip()
        if not line:
            continue
        freq, key = line.split('\t', 1)
        freq = int(freq)
        if this_key is None:
            this_key = key
            this_freq = freq
            df = 1
        elif key != this_key:
            out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
            this_key = key
            this_freq = freq
            df = 1
        else:
            this_freq += freq
            df += 1
    if this_key is not None:
        out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
    out_file.close()


if __name__ == '__main__':
    plac.call(main)

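For context, the new script expects its input to be sorted by key, one count<TAB>key pair per document, and emits one total<TAB>document-frequency<TAB>key line per key. A small illustration with made-up data (the /tmp paths are only for the example):

# Illustrative only: build a tiny input file and show what the script emits.
with open('/tmp/per_doc_counts.txt', 'w') as f:
    f.write('3\tthe\n')    # per-document counts, grouped by key
    f.write('5\tthe\n')
    f.write('1\tdog\n')

# Running:  python bin/gather_freqs.py /tmp/per_doc_counts.txt /tmp/freqs.txt
# should then write (total count, document frequency, key):
#   8	2	the
#   1	1	dog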

@ -15,6 +15,8 @@ Requires:
* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters * clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
* vectors.tgz --- output of something like word2vec * vectors.tgz --- output of something like word2vec
""" """
from __future__ import unicode_literals
import plac import plac
from pathlib import Path from pathlib import Path
@ -45,7 +47,7 @@ def setup_tokenizer(lang_data_dir, tok_dir):
def _read_clusters(loc): def _read_clusters(loc):
if not loc.exists(): if not loc.exists():
print "Warning: Clusters file not found" print("Warning: Clusters file not found")
return {} return {}
clusters = {} clusters = {}
for line in codecs.open(str(loc), 'r', 'utf8'): for line in codecs.open(str(loc), 'r', 'utf8'):
@ -60,7 +62,7 @@ def _read_clusters(loc):
else: else:
clusters[word] = '0' clusters[word] = '0'
# Expand clusters with re-casing # Expand clusters with re-casing
for word, cluster in clusters.items(): for word, cluster in list(clusters.items()):
if word.lower() not in clusters: if word.lower() not in clusters:
clusters[word.lower()] = cluster clusters[word.lower()] = cluster
if word.title() not in clusters: if word.title() not in clusters:
@ -72,7 +74,7 @@ def _read_clusters(loc):
def _read_probs(loc): def _read_probs(loc):
if not loc.exists(): if not loc.exists():
print "Warning: Probabilities file not found" print("Warning: Probabilities file not found")
return {} return {}
probs = {} probs = {}
for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')): for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
@ -85,7 +87,7 @@ def _read_probs(loc):
def _read_senses(loc): def _read_senses(loc):
lexicon = defaultdict(lambda: defaultdict(list)) lexicon = defaultdict(lambda: defaultdict(list))
if not loc.exists(): if not loc.exists():
print "Warning: WordNet senses not found" print("Warning: WordNet senses not found")
return lexicon return lexicon
sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS)) sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ} pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
@ -109,13 +111,20 @@ def setup_vocab(src_dir, dst_dir):
if vectors_src.exists(): if vectors_src.exists():
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin')) write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
else: else:
print "Warning: Word vectors file not found" print("Warning: Word vectors file not found")
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props) vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
clusters = _read_clusters(src_dir / 'clusters.txt') clusters = _read_clusters(src_dir / 'clusters.txt')
probs = _read_probs(src_dir / 'words.sgt.prob') probs = _read_probs(src_dir / 'words.sgt.prob')
lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ) if not probs:
min_prob = 0.0
else:
min_prob = min(probs.values())
for word in clusters:
if word not in probs:
probs[word] = min_prob
lexicon = [] lexicon = []
for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])): for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
entry = get_lex_props(word) entry = get_lex_props(word)
if word in clusters or float(prob) >= -17: if word in clusters or float(prob) >= -17:
entry['prob'] = float(prob) entry['prob'] = float(prob)
@ -144,7 +153,7 @@ def main(lang_data_dir, corpora_dir, model_dir):
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer') setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
setup_vocab(corpora_dir, model_dir / 'vocab') setup_vocab(corpora_dir, model_dir / 'vocab')
if not (model_dir / 'wordnet').exists(): if not (model_dir / 'wordnet').exists():
copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet')) copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))
if __name__ == '__main__': if __name__ == '__main__':
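
The block added above backfills a probability for every clustered word that is missing from words.sgt.prob, using the smallest observed log-probability as a floor so those entries survive the prob >= -17 cut-off that follows. A standalone sketch of the idea, with made-up values:

# Standalone sketch of the backfill logic (values are illustrative).
probs = {'the': -3.2, 'dog': -9.7}          # log-probs from words.sgt.prob
clusters = {'the': '1111', 'dog': '1011', 'frumious': '0010'}

min_prob = min(probs.values()) if probs else 0.0
for word in clusters:
    probs.setdefault(word, min_prob)        # 'frumious' now gets -9.7

assert probs['frumious'] == min_prob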


@ -1,6 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
from __future__ import division from __future__ import division
from __future__ import unicode_literals from __future__ import unicode_literals
from __future__ import print_function
import os import os
from os import path from os import path
@ -107,7 +108,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
nlp = Language(data_dir=model_dir) nlp = Language(data_dir=model_dir)
print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %" print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter): for itn in range(n_iter):
scorer = Scorer() scorer = Scorer()
loss = 0 loss = 0
@ -138,9 +139,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
nlp.entity.train(tokens, gold) nlp.entity.train(tokens, gold)
nlp.tagger.train(tokens, gold.tags) nlp.tagger.train(tokens, gold.tags)
random.shuffle(gold_tuples) random.shuffle(gold_tuples)
print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc, scorer.tags_acc,
scorer.token_acc) scorer.token_acc))
nlp.end_training() nlp.end_training()
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
@ -219,14 +220,14 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos
# write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
scorer = evaluate(English, list(read_json_file(dev_loc)), scorer = evaluate(English, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose) model_dir, gold_preproc=gold_preproc, verbose=verbose)
print 'TOK', scorer.token_acc print('TOK', scorer.token_acc)
print 'POS', scorer.tags_acc print('POS', scorer.tags_acc)
print 'UAS', scorer.uas print('UAS', scorer.uas)
print 'LAS', scorer.las print('LAS', scorer.las)
print 'NER P', scorer.ents_p print('NER P', scorer.ents_p)
print 'NER R', scorer.ents_r print('NER R', scorer.ents_r)
print 'NER F', scorer.ents_f print('NER F', scorer.ents_f)
if __name__ == '__main__': if __name__ == '__main__':

316709
corpora/en/clusters.txt Normal file

File diff suppressed because it is too large


@ -2,7 +2,7 @@ cython
cymem == 1.11 cymem == 1.11
pathlib pathlib
preshed == 0.37 preshed == 0.37
thinc == 3.2 thinc == 3.3
murmurhash == 0.24 murmurhash == 0.24
unidecode unidecode
numpy numpy


@ -120,7 +120,7 @@ def run_setup(exts):
ext_modules=exts, ext_modules=exts,
license="Dual: Commercial or AGPL", license="Dual: Commercial or AGPL",
install_requires=['numpy', 'murmurhash', 'cymem >= 1.11', 'preshed == 0.37', install_requires=['numpy', 'murmurhash', 'cymem >= 1.11', 'preshed == 0.37',
'thinc == 3.2', "unidecode", 'wget', 'plac', 'six', 'thinc == 3.3', "unidecode", 'wget', 'plac', 'six',
'ujson'], 'ujson'],
setup_requires=["headers_workaround"], setup_requires=["headers_workaround"],
) )
@ -162,6 +162,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.gold', 'spacy.orth', 'spacy.gold', 'spacy.orth',
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token', 'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
'spacy.cfile',
'spacy.syntax.ner'] 'spacy.syntax.ner']

12
spacy/cfile.pxd Normal file

@ -0,0 +1,12 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool


cdef class CFile:
    cdef FILE* fp
    cdef bint is_open

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *

40
spacy/cfile.pyx Normal file

@ -0,0 +1,40 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE


cdef class CFile:
    def __init__(self, loc, mode):
        if isinstance(mode, unicode):
            mode_str = mode.encode('ascii')
        else:
            mode_str = mode
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        self.fp = fopen(<char*>bytes_loc, mode_str)
        if self.fp == NULL:
            raise IOError("Could not open binary file %s" % bytes_loc)
        self.is_open = True

    def __dealloc__(self):
        if self.is_open:
            fclose(self.fp)

    def close(self):
        fclose(self.fp)
        self.is_open = False

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        st = fread(dest, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
        st = fwrite(src, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write_from(chars, len(py_bytes), sizeof(char))

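Only the constructor, close() and write_unicode() are callable from Python; read_into(), write_from() and alloc_read() are cdef-only and are used from other Cython modules such as vocab.pyx later in this commit. A rough usage sketch, assuming the spacy.cfile extension has been built (the path is illustrative):

from spacy.cfile import CFile

cf = CFile('/tmp/demo.bin', 'wb')   # wraps fopen()
cf.write_unicode(u'hello')          # UTF-8 encodes and writes the raw bytes
cf.close()

assert open('/tmp/demo.bin', 'rb').read() == b'hello'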

@ -95,15 +95,15 @@ class English(object):
self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer')) self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))
if Tagger: if Tagger and path.exists(path.join(data_dir, 'pos')):
self.tagger = Tagger(self.vocab.strings, data_dir) self.tagger = Tagger(self.vocab.strings, data_dir)
else: else:
self.tagger = None self.tagger = None
if Parser: if Parser and path.exists(path.join(data_dir, 'deps')):
self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps')) self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps'))
else: else:
self.parser = None self.parser = None
if Entity: if Entity and path.exists(path.join(data_dir, 'ner')):
self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner')) self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
else: else:
self.entity = None self.entity = None
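
Guarding each component on the presence of its data directory means a minimal install (vocab and tokenizer only, as produced by the new Travis setup above) still imports and runs; missing components simply come back as None. A hedged sketch, with an illustrative data path:

# Hedged sketch: with a data directory lacking 'pos'/'deps'/'ner' models,
# the corresponding pipeline components are simply left as None.
from spacy.en import English

nlp = English(data_dir='spacy/en/data')       # path is illustrative
for name in ('tagger', 'parser', 'entity'):
    if getattr(nlp, name) is None:
        print('%s model not installed' % name)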
@ -153,15 +153,14 @@ class English(object):
self.tagger.model.end_training() self.tagger.model.end_training()
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt')) self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
packer = Packer(self.vocab, [ with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
(TAG, self.tagger.moves.freqs[TAG].items()), file_.write(
(HEAD, self.parser.moves.freqs[HEAD].items()), json.dumps([
(DEP, self.parser.moves.freqs[DEP].items()), (TAG, self.tagger.freqs[TAG].items()),
(ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()), (DEP, self.parser.moves.freqs[DEP].items()),
(ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items()) (ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()),
]) (ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items()),
(HEAD, self.parser.moves.freqs[HEAD].items())]))
packer.dump(path.join(data_dir, 'vocab'))
@property @property
def tags(self): def tags(self):
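
end_training() now dumps the tag and transition frequency tables to vocab/serializer.json instead of building a Packer up front; the new Vocab.serializer property further down reads that file back lazily. A hedged sketch of what a consumer of that JSON sees (the path is illustrative):

import json

with open('spacy/en/data/vocab/serializer.json') as file_:   # illustrative path
    freqs = json.load(file_)

for attr_id, counts in freqs:
    # counts is a list of [value, frequency] pairs for that attribute
    print(attr_id, counts[:3])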


@ -14,6 +14,9 @@ from ..attrs cimport LEMMA as _LEMMA
from ..attrs cimport POS as _POS from ..attrs cimport POS as _POS
from ..attrs cimport TAG as _TAG from ..attrs cimport TAG as _TAG
from ..attrs cimport DEP as _DEP from ..attrs cimport DEP as _DEP
from ..attrs cimport HEAD as _HEAD
from ..attrs cimport ENT_IOB as _ENT_IOB
from ..attrs cimport ENT_TYPE as _ENT_TYPE
cpdef enum: cpdef enum:


@ -262,6 +262,9 @@ cdef class EnPosTagger:
'morphs.json')))) 'morphs.json'))))
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
self.freqs = {TAG: defaultdict(int)} self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names:
self.freqs[TAG][self.strings[tag]] = 1
self.freqs[TAG][0] = 1
def __call__(self, Doc tokens): def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object. """Apply the tagger, setting the POS tags onto the Doc object.


@ -3,8 +3,6 @@ from cymem.cymem cimport Pool
from .structs cimport TokenC from .structs cimport TokenC
from .syntax.transition_system cimport Transition from .syntax.transition_system cimport Transition
cimport numpy
cdef struct GoldParseC: cdef struct GoldParseC:
int* tags int* tags


@ -1,7 +1,5 @@
import numpy import numpy
import codecs import codecs
import json
import ujson
import random import random
import re import re
import os import os
@ -9,6 +7,11 @@ from os import path
from libc.string cimport memset from libc.string cimport memset
try:
    import ujson as json
except ImportError:
    import json
def tags_to_entities(tags): def tags_to_entities(tags):
entities = [] entities = []
@ -128,7 +131,7 @@ def read_json_file(loc, docs_filter=None):
yield from read_json_file(path.join(loc, filename)) yield from read_json_file(path.join(loc, filename))
else: else:
with open(loc) as file_: with open(loc) as file_:
docs = ujson.load(file_) docs = json.load(file_)
for doc in docs: for doc in docs:
if docs_filter is not None and not docs_filter(doc): if docs_filter is not None and not docs_filter(doc):
continue continue


@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil
cdef class BitArray: cdef class BitArray:
cdef bytes data cdef bytearray data
cdef uchar byte cdef uchar byte
cdef uchar bit_of_byte cdef uchar bit_of_byte
cdef uint32_t i cdef uint32_t i


@ -1,3 +1,5 @@
from __future__ import unicode_literals
from libc.string cimport memcpy from libc.string cimport memcpy
# Note that we're setting the most significant bits here first, when in practice # Note that we're setting the most significant bits here first, when in practice
@ -15,7 +17,7 @@ cdef Code bit_append(Code code, bint bit) nogil:
cdef class BitArray: cdef class BitArray:
def __init__(self, data=b''): def __init__(self, data=b''):
self.data = data self.data = bytearray(data)
self.byte = 0 self.byte = 0
self.bit_of_byte = 0 self.bit_of_byte = 0
self.i = 0 self.i = 0
@ -45,7 +47,7 @@ cdef class BitArray:
start_bit = self.i % 8 start_bit = self.i % 8
if start_bit != 0 and start_byte < len(self.data): if start_bit != 0 and start_byte < len(self.data):
byte = ord(self.data[start_byte]) byte = self.data[start_byte]
for i in range(start_bit, 8): for i in range(start_bit, 8):
self.i += 1 self.i += 1
yield 1 if (byte & (one << i)) else 0 yield 1 if (byte & (one << i)) else 0
@ -68,18 +70,24 @@ cdef class BitArray:
# TODO portability # TODO portability
cdef uchar[4] chars cdef uchar[4] chars
chars[0] = <uchar>ord(self.data[start_byte]) chars[0] = self.data[start_byte]
chars[1] = <uchar>ord(self.data[start_byte+1]) chars[1] = self.data[start_byte+1]
chars[2] = <uchar>ord(self.data[start_byte+2]) chars[2] = self.data[start_byte+2]
chars[3] = <uchar>ord(self.data[start_byte+3]) chars[3] = self.data[start_byte+3]
cdef uint32_t output cdef uint32_t output
memcpy(&output, chars, 4) memcpy(&output, chars, 4)
self.i += 32 self.i += 32
return output return output
def as_bytes(self): def as_bytes(self):
cdef unsigned char byte_char
if self.bit_of_byte != 0: if self.bit_of_byte != 0:
return self.data + chr(self.byte) byte = chr(self.byte)
# Jump through some hoops for Python3
if isinstance(byte, unicode):
return self.data + <bytes>(&self.byte)[:1]
else:
return self.data + chr(self.byte)
else: else:
return self.data return self.data
@ -92,7 +100,7 @@ cdef class BitArray:
self.bit_of_byte += 1 self.bit_of_byte += 1
self.i += 1 self.i += 1
if self.bit_of_byte == 8: if self.bit_of_byte == 8:
self.data += chr(self.byte) self.data += bytearray((self.byte,))
self.byte = 0 self.byte = 0
self.bit_of_byte = 0 self.bit_of_byte = 0
@ -106,7 +114,7 @@ cdef class BitArray:
self.byte &= ~(one << self.bit_of_byte) self.byte &= ~(one << self.bit_of_byte)
self.bit_of_byte += 1 self.bit_of_byte += 1
if self.bit_of_byte == 8: if self.bit_of_byte == 8:
self.data += chr(self.byte) self.data += <bytes>self.byte
self.byte = 0 self.byte = 0
self.bit_of_byte = 0 self.bit_of_byte = 0
self.i += 1 self.i += 1
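
Backing the BitArray with a bytearray instead of bytes is what makes the same code work on Python 2 and 3: indexing a bytearray yields an int on both, so the ord()/chr() juggling disappears. The append logic is simple enough to sketch in pure Python:

# Pure-Python sketch of the append/flush logic used by BitArray.
data, byte, bit_of_byte = bytearray(), 0, 0

for bit in [1, 0, 1, 1, 0, 0, 0, 1, 1]:       # append 9 bits
    if bit:
        byte |= 1 << bit_of_byte              # least-significant bit first
    bit_of_byte += 1
    if bit_of_byte == 8:
        data += bytearray((byte,))            # flush a completed byte
        byte, bit_of_byte = 0, 0

print(list(data), byte, bit_of_byte)          # [141] 1 1 -> 8 bits flushed, 1 pending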


@ -1,4 +1,5 @@
# cython: profile=True # cython: profile=True
from __future__ import unicode_literals
cimport cython cimport cython
from libcpp.queue cimport priority_queue from libcpp.queue cimport priority_queue
from libcpp.pair cimport pair from libcpp.pair cimport pair
@ -110,14 +111,14 @@ cdef class HuffmanCodec:
cdef int branch cdef int branch
cdef int n_msg = msg.shape[0] cdef int n_msg = msg.shape[0]
cdef bytes bytes_ = bits.as_bytes() cdef bytearray bytes_ = bits.as_bytes()
cdef unsigned char byte cdef unsigned char byte
cdef int i_msg = 0 cdef int i_msg = 0
cdef int i_byte = bits.i // 8 cdef int i_byte = bits.i // 8
cdef unsigned char i_bit = 0 cdef unsigned char i_bit = 0
cdef unsigned char one = 1 cdef unsigned char one = 1
while i_msg < n_msg: while i_msg < n_msg:
byte = ord(bytes_[i_byte]) byte = bytes_[i_byte]
i_byte += 1 i_byte += 1
for i_bit in range(8): for i_bit in range(8):
branch = node.right if (byte & (one << i_bit)) else node.left branch = node.right if (byte & (one << i_bit)) else node.left
@ -138,11 +139,11 @@ cdef class HuffmanCodec:
def __get__(self): def __get__(self):
output = [] output = []
cdef int i, j cdef int i, j
cdef bytes string cdef unicode string
cdef Code code cdef Code code
for i in range(self.codes.size()): for i in range(self.codes.size()):
code = self.codes[i] code = self.codes[i]
string = b'{0:b}'.format(code.bits).rjust(code.length, '0') string = '{0:b}'.format(code.bits).rjust(code.length, '0')
string = string[::-1] string = string[::-1]
output.append(string) output.append(string)
return output return output
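
A rough sketch, mirroring tests/serialize/test_huffman.py further down and assuming the compiled spacy.serialize.huffman extension is importable; note the per-symbol code strings are now plain unicode on both Python versions:

from spacy.serialize.huffman import HuffmanCodec

probs = [0.35, 0.25, 0.2, 0.1, 0.05, 0.05]
codec = HuffmanCodec(list(enumerate(probs)))     # (symbol, weight) pairs
for symbol, code in zip(codec.leaves, codec.strings):
    print(symbol, code)                          # code is a unicode bit-string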


@ -10,6 +10,7 @@ from libcpp.pair cimport pair
from cymem.cymem cimport Address, Pool from cymem.cymem cimport Address, Pool
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter from preshed.counter cimport PreshCounter
import json
from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
@ -65,7 +66,7 @@ def _gen_orths(Vocab vocab):
def _gen_chars(Vocab vocab): def _gen_chars(Vocab vocab):
cdef attr_t orth cdef attr_t orth
cdef size_t addr cdef size_t addr
char_weights = {chr(i): 1e-20 for i in range(256)} char_weights = {i: 1e-20 for i in range(256)}
cdef unicode string cdef unicode string
cdef bytes char cdef bytes char
cdef bytes utf8_str cdef bytes utf8_str
@ -74,9 +75,9 @@ def _gen_chars(Vocab vocab):
string = vocab.strings[lex.orth] string = vocab.strings[lex.orth]
utf8_str = string.encode('utf8') utf8_str = string.encode('utf8')
for char in utf8_str: for char in utf8_str:
char_weights.setdefault(char, 0.0) char_weights.setdefault(ord(char), 0.0)
char_weights[char] += c_exp(lex.prob) char_weights[ord(char)] += c_exp(lex.prob)
char_weights[b' '] += c_exp(lex.prob) char_weights[ord(' ')] += c_exp(lex.prob)
return char_weights.items() return char_weights.items()
@ -98,33 +99,34 @@ cdef class Packer:
self._codecs = tuple(codecs) self._codecs = tuple(codecs)
self.attrs = tuple(attrs) self.attrs = tuple(attrs)
@classmethod
def from_dir(cls, Vocab vocab, data_dir):
return cls(vocab, util.read_encoding_freqs(data_dir))
def pack(self, Doc doc): def pack(self, Doc doc):
bits = self._orth_encode(doc) bits = self._orth_encode(doc)
if bits is None: if bits is None:
bits = self._char_encode(doc) bits = self._char_encode(doc)
cdef int i cdef int i
if self.attrs: if self.attrs:
array = doc.to_array(self.attrs) array = doc.to_array(self.attrs)
for i, codec in enumerate(self._codecs): for i, codec in enumerate(self._codecs):
codec.encode_int32(array[:, i], bits) codec.encode(array[:, i], bits)
return bits return bits.as_bytes()
def unpack(self, BitArray bits): def unpack(self, data):
doc = Doc(self.vocab)
self.unpack_into(data, doc)
return doc
def unpack_into(self, byte_string, Doc doc):
bits = BitArray(byte_string)
bits.seek(0) bits.seek(0)
cdef int32_t length = bits.read32() cdef int32_t length = bits.read32()
if length >= 0: if length >= 0:
doc = self._orth_decode(bits, length) self._orth_decode(bits, length, doc)
else: else:
doc = self._char_decode(bits, -length) self._char_decode(bits, -length, doc)
array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32) array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32)
for i, codec in enumerate(self._codecs): for i, codec in enumerate(self._codecs):
codec.decode_int32(bits, array[:, i]) codec.decode(bits, array[:, i])
doc.from_array(self.attrs, array) doc.from_array(self.attrs, array)
return doc return doc
@ -141,20 +143,13 @@ cdef class Packer:
bits.append(bool(token.whitespace_)) bits.append(bool(token.whitespace_))
return bits return bits
def _orth_decode(self, BitArray bits, n):
orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
self.orth_codec.decode_int32(bits, orths)
orths_and_spaces = zip(orths, bits)
cdef Doc doc = Doc(self.vocab, orths_and_spaces)
return doc
def _char_encode(self, Doc doc): def _char_encode(self, Doc doc):
cdef bytes utf8_str = doc.string.encode('utf8') cdef bytes utf8_str = doc.string.encode('utf8')
cdef BitArray bits = BitArray() cdef BitArray bits = BitArray()
cdef int32_t length = len(utf8_str) cdef int32_t length = len(utf8_str)
# Signal chars with negative length # Signal chars with negative length
bits.extend(-length, 32) bits.extend(-length, 32)
self.char_codec.encode(utf8_str, bits) self.char_codec.encode(bytearray(utf8_str), bits)
cdef int i, j cdef int i, j
for i in range(doc.length): for i in range(doc.length):
for j in range(doc.data[i].lex.length-1): for j in range(doc.data[i].lex.length-1):
@ -164,12 +159,24 @@ cdef class Packer:
bits.append(False) bits.append(False)
return bits return bits
def _char_decode(self, BitArray bits, n): def _orth_decode(self, BitArray bits, int32_t n, Doc doc):
cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
self.orth_codec.decode_int32(bits, orths)
cdef int i
cdef bint space
spaces = iter(bits)
for i in range(n):
orth = orths[i]
space = next(spaces)
lex = self.vocab.get_by_orth(doc.mem, orth)
doc.push_back(lex, space)
return doc
def _char_decode(self, BitArray bits, int32_t n, Doc doc):
cdef bytearray utf8_str = bytearray(n) cdef bytearray utf8_str = bytearray(n)
self.char_codec.decode(bits, utf8_str) self.char_codec.decode(bits, utf8_str)
cdef unicode string = utf8_str.decode('utf8') cdef unicode string = utf8_str.decode('utf8')
cdef Doc tokens = Doc(self.vocab)
cdef int start = 0 cdef int start = 0
cdef bint is_spacy cdef bint is_spacy
cdef int length = len(string) cdef int length = len(string)
@ -178,11 +185,11 @@ cdef class Packer:
for is_end_token in bits: for is_end_token in bits:
if is_end_token: if is_end_token:
span = string[start:i+1] span = string[start:i+1]
lex = self.vocab.get(tokens.mem, span) lex = self.vocab.get(doc.mem, span)
is_spacy = (i+1) < length and string[i+1] == u' ' is_spacy = (i+1) < length and string[i+1] == u' '
tokens.push_back(lex, is_spacy) doc.push_back(lex, is_spacy)
start = i + 1 + is_spacy start = i + 1 + is_spacy
i += 1 i += 1
if i >= n: if i >= n:
break break
return tokens return doc
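
With these changes Packer.pack() hands back a plain byte string and unpack()/unpack_into() rebuild a Doc from one, so callers no longer deal with BitArray directly. A hedged round-trip sketch, assuming a full English data directory is installed:

from spacy.en import English

nlp = English()
doc = nlp(u'This is a short test sentence.')

packed = nlp.vocab.serializer.pack(doc)      # a byte string, not a BitArray
doc2 = nlp.vocab.serializer.unpack(packed)   # builds and returns a fresh Doc
assert doc2.string == doc.string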


@ -81,6 +81,7 @@ cdef class StringStore:
def __getitem__(self, object string_or_id): def __getitem__(self, object string_or_id):
cdef bytes byte_string cdef bytes byte_string
cdef const Utf8Str* utf8str cdef const Utf8Str* utf8str
cdef int id_
if isinstance(string_or_id, int) or isinstance(string_or_id, long): if isinstance(string_or_id, int) or isinstance(string_or_id, long):
if string_or_id == 0: if string_or_id == 0:
return u'' return u''


@ -1,4 +1,3 @@
# cython: profile=True
""" """
Fill an array, context, with every _atomic_ value our features reference. Fill an array, context, with every _atomic_ value our features reference.
We then write the _actual features_ as tuples of the atoms. The machinery We then write the _actual features_ as tuples of the atoms. The machinery


@ -1,4 +1,3 @@
# cython: profile=True
from __future__ import unicode_literals from __future__ import unicode_literals
import ctypes import ctypes


@ -85,6 +85,9 @@ cdef class BiluoPushDown(TransitionSystem):
elif gold.c.ner[i].move == OUT: elif gold.c.ner[i].move == OUT:
self.freqs[ENT_IOB][1] += 1 self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1 self.freqs[ENT_TYPE][0] += 1
else:
self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1
cdef Transition lookup_transition(self, object name) except *: cdef Transition lookup_transition(self, object name) except *:
if name == '-': if name == '-':


@ -1,4 +1,3 @@
# cython: profile=True
""" """
MALT-style dependency parser MALT-style dependency parser
""" """
@ -85,18 +84,17 @@ cdef class Parser:
cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
self.model.n_feats, self.model.n_feats) self.model.n_feats, self.model.n_feats)
self.parse(stcls, eg.c) with nogil:
self.parse(stcls, eg.c)
tokens.set_parse(stcls._sent) tokens.set_parse(stcls._sent)
cdef void parse(self, StateClass stcls, ExampleC eg) nogil: cdef void parse(self, StateClass stcls, ExampleC eg) nogil:
while not stcls.is_final(): while not stcls.is_final():
memset(eg.scores, 0, eg.nr_class * sizeof(weight_t)) memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
self.moves.set_valid(eg.is_valid, stcls) self.moves.set_valid(eg.is_valid, stcls)
fill_context(eg.atoms, stcls) fill_context(eg.atoms, stcls)
self.model.set_scores(eg.scores, eg.atoms) self.model.set_scores(eg.scores, eg.atoms)
eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes) eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes)
self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
self.moves.finalize_state(stcls) self.moves.finalize_state(stcls)


@ -1,4 +1,3 @@
# cython: profile=True
from libc.string cimport memcpy, memset from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from ..vocab cimport EMPTY_LEXEME from ..vocab cimport EMPTY_LEXEME


@ -33,6 +33,11 @@ cdef class TransitionSystem:
self.freqs = {} self.freqs = {}
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB): for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
self.freqs[attr] = defaultdict(int) self.freqs[attr] = defaultdict(int)
self.freqs[attr][0] = 1
# Ensure we've seen heads. Need an official dependency length limit...
for i in range(512):
self.freqs[HEAD][i] = 1
self.freqs[HEAD][-i] = 1
cdef int initialize_state(self, StateClass state) except -1: cdef int initialize_state(self, StateClass state) except -1:
pass pass
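
Seeding every table with a count of 1, and pre-populating head offsets out to +/-512, guarantees that the Huffman codecs built from these tables have a code for every symbol they might later be asked to encode, even ones never seen during training. In miniature:

# Miniature illustration of why the frequency tables are smoothed with 1s.
from collections import defaultdict

freqs = defaultdict(int)
freqs[0] = 1                       # the unknown / missing value
for i in range(512):               # plausible head offsets in both directions
    freqs[i] = 1
    freqs[-i] = 1

freqs[1] += 40000                  # real training counts pile on top
freqs[-2] += 12000
assert all(count > 0 for count in freqs.values())   # no zero-frequency symbol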


@ -71,17 +71,6 @@ cdef class Doc:
self.is_tagged = False self.is_tagged = False
self.is_parsed = False self.is_parsed = False
self._py_tokens = [] self._py_tokens = []
cdef const LexemeC* lex
cdef attr_t orth
cdef bint space
if orths_and_spaces is not None:
for orth, space in orths_and_spaces:
lex = <LexemeC*>self.vocab._by_orth.get(orth)
if lex != NULL:
assert lex.orth == orth
self.push_back(lex, space)
else:
raise Exception('Lexeme not found: %d' % orth)
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a token. """Get a token.
@ -122,9 +111,12 @@ cdef class Doc:
def __unicode__(self): def __unicode__(self):
return u''.join([t.string for t in self]) return u''.join([t.string for t in self])
def __str__(self):
return u''.join([t.string for t in self])
@property @property
def string(self): def string(self):
return unicode(self) return u''.join([t.string for t in self])
@property @property
def ents(self): def ents(self):
@ -303,12 +295,11 @@ cdef class Doc:
return self return self
def to_bytes(self): def to_bytes(self):
bits = self.vocab.packer.pack(self) byte_string = self.vocab.serializer.pack(self)
return struct.pack('I', len(bits)) + bits.as_bytes() return struct.pack('I', len(byte_string)) + byte_string
def from_bytes(self, data): def from_bytes(self, data):
bits = BitArray(data) self.vocab.serializer.unpack_into(data[4:], self)
self.vocab.packer.unpack_into(bits, self)
return self return self
@staticmethod @staticmethod
@ -316,15 +307,14 @@ cdef class Doc:
keep_reading = True keep_reading = True
while keep_reading: while keep_reading:
try: try:
n_bits_str = file_.read(4) n_bytes_str = file_.read(4)
if len(n_bits_str) < 4: if len(n_bytes_str) < 4:
break break
n_bits = struct.unpack('I', n_bits_str)[0] n_bytes = struct.unpack('I', n_bytes_str)[0]
n_bytes = n_bits // 8 + bool(n_bits % 8)
data = file_.read(n_bytes) data = file_.read(n_bytes)
except StopIteration: except StopIteration:
keep_reading = False keep_reading = False
yield data yield n_bytes_str + data
# This function is terrible --- need to fix this. # This function is terrible --- need to fix this.
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
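
Doc.to_bytes() now writes a 4-byte unsigned length prefix (struct format 'I') followed by the packed payload, and read_bytes() yields the whole frame so from_bytes() can skip the prefix with data[4:]. The framing is ordinary struct packing:

# Sketch of the length-prefixed framing used by Doc.to_bytes / Doc.read_bytes.
import struct

payload = b'example packed doc'
frame = struct.pack('I', len(payload)) + payload      # what to_bytes() emits

n_bytes = struct.unpack('I', frame[:4])[0]            # what read_bytes() parses
assert frame[4:4 + n_bytes] == payload                # what from_bytes() consumes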


@ -34,6 +34,9 @@ cdef class Token:
def __unicode__(self): def __unicode__(self):
return self.string return self.string
def __str__(self):
return self.string
cpdef bint check_flag(self, attr_id_t flag_id) except -1: cpdef bint check_flag(self, attr_id_t flag_id) except -1:
return check_flag(self.c.lex, flag_id) return check_flag(self.c.lex, flag_id)


@ -65,16 +65,6 @@ def read_tokenization(lang):
return entries return entries
def read_encoding_freqs(data_dir):
tags = json.load(open(path.join(data_dir, '..', 'pos', 'tag_freqs.json')))
heads = json.load(open(path.join(data_dir, '..', 'deps', 'head_freqs.json')))
deps = json.load(open(path.join(data_dir, '..', 'deps', 'dep_freqs.json')))
iob = json.load(open(path.join(data_dir, '..', 'ner', 'iob_freqs.json')))
ne_types = json.load(open(path.join(data_dir, '..', 'ner', 'ne_freqs.json')))
return [(TAG, tags), (HEAD, heads), (DEP, deps), (ENT_IOB, iob),
(ENT_TYPE, ne_types)]
def read_detoken_rules(lang): # Deprecated? def read_detoken_rules(lang): # Deprecated?
loc = path.join(DATA_DIR, lang, 'detokenize') loc = path.join(DATA_DIR, lang, 'detokenize')
entries = [] entries = []


@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from .structs cimport LexemeC, TokenC from .structs cimport LexemeC, TokenC
from .typedefs cimport utf8_t, hash_t from .typedefs cimport utf8_t, attr_t, hash_t
from .strings cimport StringStore from .strings cimport StringStore
@ -29,9 +29,12 @@ cdef class Vocab:
cpdef readonly StringStore strings cpdef readonly StringStore strings
cdef readonly object pos_tags cdef readonly object pos_tags
cdef readonly int length cdef readonly int length
cdef public object packer cdef public object _serializer
cdef public object data_dir
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef PreshMap _by_hash cdef PreshMap _by_hash


@ -1,3 +1,6 @@
from __future__ import unicode_literals
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memset from libc.string cimport memset
from libc.stdint cimport int32_t from libc.stdint cimport int32_t
@ -6,6 +9,7 @@ import bz2
from os import path from os import path
import codecs import codecs
import math import math
import json
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport set_lex_struct_props from .lexeme cimport set_lex_struct_props
@ -13,6 +17,7 @@ from .lexeme cimport Lexeme
from .strings cimport hash_string from .strings cimport hash_string
from .orth cimport word_shape from .orth cimport word_shape
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .cfile cimport CFile
from cymem.cymem cimport Address from cymem.cymem cimport Address
from . import util from . import util
@ -54,8 +59,19 @@ cdef class Vocab:
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')): if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
#self.packer = Packer(self, util.read_encoding_freqs(data_dir)) self._serializer = None
self.packer = None self.data_dir = data_dir
    property serializer:
        def __get__(self):
            if self._serializer is None:
                freqs = []
                if self.data_dir is not None:
                    freqs_loc = path.join(self.data_dir, 'serializer.json')
                    if path.exists(freqs_loc):
                        freqs = json.load(open(freqs_loc))
                self._serializer = Packer(self, freqs)
            return self._serializer
def __len__(self): def __len__(self):
"""The current number of lexemes stored.""" """The current number of lexemes stored."""
@ -82,6 +98,27 @@ cdef class Vocab:
self._add_lex_to_vocab(key, lex) self._add_lex_to_vocab(key, lex)
return lex return lex
    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool. If the pool
        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
        cdef LexemeC* lex
        lex = <LexemeC*>self._by_orth.get(orth)
        if lex != NULL:
            return lex
        cdef unicode string = self.strings[orth]
        cdef bint is_oov = mem is not self.mem
        if len(string) < 3:
            mem = self.mem
        lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
        props = self.lexeme_props_getter(string)
        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
        if is_oov:
            lex.id = 0
        else:
            self._add_lex_to_vocab(hash_string(string), lex)
        return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._by_hash.set(key, <void*>lex) self._by_hash.set(key, <void*>lex)
self._by_orth.set(lex.orth, <void*>lex) self._by_orth.set(lex.orth, <void*>lex)
@ -138,19 +175,16 @@ cdef class Vocab:
if path.exists(loc): if path.exists(loc):
assert not path.isdir(loc) assert not path.isdir(loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
assert fp != NULL cdef CFile fp = CFile(bytes_loc, 'wb')
cdef size_t st cdef size_t st
cdef size_t addr cdef size_t addr
cdef hash_t key cdef hash_t key
for key, addr in self._by_hash.items(): for key, addr in self._by_hash.items():
lexeme = <LexemeC*>addr lexeme = <LexemeC*>addr
st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp) fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
assert st == 1 fp.write_from(lexeme, sizeof(LexemeC), 1)
st = fwrite(lexeme, sizeof(LexemeC), 1, fp) fp.close()
assert st == 1
st = fclose(fp)
assert st == 0
def load_lexemes(self, strings_loc, loc): def load_lexemes(self, strings_loc, loc):
self.strings.load(strings_loc) self.strings.load(strings_loc)
@ -188,7 +222,7 @@ cdef class Vocab:
fclose(fp) fclose(fp)
def load_rep_vectors(self, loc): def load_rep_vectors(self, loc):
file_ = _CFile(loc, b'rb') cdef CFile file_ = CFile(loc, b'rb')
cdef int32_t word_len cdef int32_t word_len
cdef int32_t vec_len cdef int32_t vec_len
cdef int32_t prev_vec_len = 0 cdef int32_t prev_vec_len = 0
@ -198,22 +232,20 @@ cdef class Vocab:
cdef bytes py_word cdef bytes py_word
cdef vector[float*] vectors cdef vector[float*] vectors
cdef int i cdef int i
cdef Pool tmp_mem = Pool()
while True: while True:
try: try:
file_.read(&word_len, sizeof(word_len), 1) file_.read_into(&word_len, sizeof(word_len), 1)
except IOError: except IOError:
break break
file_.read(&vec_len, sizeof(vec_len), 1) file_.read_into(&vec_len, sizeof(vec_len), 1)
if prev_vec_len != 0 and vec_len != prev_vec_len: if prev_vec_len != 0 and vec_len != prev_vec_len:
raise VectorReadError.mismatched_sizes(loc, vec_len, prev_vec_len) raise VectorReadError.mismatched_sizes(loc, vec_len, prev_vec_len)
if 0 >= vec_len >= MAX_VEC_SIZE: if 0 >= vec_len >= MAX_VEC_SIZE:
raise VectorReadError.bad_size(loc, vec_len) raise VectorReadError.bad_size(loc, vec_len)
mem = Address(word_len, sizeof(char))
chars = <char*>mem.ptr
vec = <float*>self.mem.alloc(vec_len, sizeof(float))
file_.read(chars, sizeof(char), word_len) chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
file_.read(vec, sizeof(float), vec_len) vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
string_id = self.strings[chars[:word_len]] string_id = self.strings[chars[:word_len]]
while string_id >= vectors.size(): while string_id >= vectors.size():
@ -235,7 +267,7 @@ cdef class Vocab:
def write_binary_vectors(in_loc, out_loc): def write_binary_vectors(in_loc, out_loc):
cdef _CFile out_file = _CFile(out_loc, 'wb') cdef CFile out_file = CFile(out_loc, 'wb')
cdef Address mem cdef Address mem
cdef int32_t word_len cdef int32_t word_len
cdef int32_t vec_len cdef int32_t vec_len
@ -252,42 +284,12 @@ def write_binary_vectors(in_loc, out_loc):
word_len = len(word) word_len = len(word)
vec_len = len(pieces) vec_len = len(pieces)
out_file.write(sizeof(word_len), 1, &word_len) out_file.write_from(&word_len, 1, sizeof(word_len))
out_file.write(sizeof(vec_len), 1, &vec_len) out_file.write_from(&vec_len, 1, sizeof(vec_len))
chars = <char*>word chars = <char*>word
out_file.write(sizeof(char), len(word), chars) out_file.write_from(chars, len(word), sizeof(char))
out_file.write(sizeof(float), vec_len, vec) out_file.write_from(vec, vec_len, sizeof(float))
cdef class _CFile:
cdef FILE* fp
def __init__(self, loc, bytes mode):
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self.fp = fopen(<char*>bytes_loc, mode)
if self.fp == NULL:
raise IOError
def __dealloc__(self):
fclose(self.fp)
def close(self):
fclose(self.fp)
cdef int read(self, void* dest, size_t elem_size, size_t n) except -1:
st = fread(dest, elem_size, n, self.fp)
if st != n:
raise IOError
cdef int write(self, size_t elem_size, size_t n, void* data) except -1:
st = fwrite(data, elem_size, n, self.fp)
if st != n:
raise IOError
cdef int write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)
class VectorReadError(Exception): class VectorReadError(Exception):
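
write_binary_vectors() emits one record per word: an int32 word length, an int32 vector length, the UTF-8 bytes of the word, then vec_len float32 values; load_rep_vectors() reads the same layout back through the new CFile helpers. A plain-Python reader for that layout, as a hedged sketch (the path is illustrative):

import struct

def iter_binary_vectors(loc):
    # Yields (word, vector) pairs from a vec.bin-style file.
    with open(loc, 'rb') as f:
        while True:
            header = f.read(8)
            if len(header) < 8:
                break
            word_len, vec_len = struct.unpack('ii', header)   # two int32s
            word = f.read(word_len).decode('utf8')
            vec = struct.unpack('%df' % vec_len, f.read(4 * vec_len))
            yield word, vec

# for word, vec in iter_binary_vectors('corpora/en/vec.bin'): ...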


@ -7,3 +7,19 @@ import os
def EN(): def EN():
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR) data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
return English(data_dir=data_dir) return English(data_dir=data_dir)
def pytest_addoption(parser):
    parser.addoption("--models", action="store_true",
        help="include tests that require full models")
    parser.addoption("--vectors", action="store_true",
        help="include word vectors tests")
    parser.addoption("--slow", action="store_true",
        help="include slow tests")


def pytest_runtest_setup(item):
    for opt in ['models', 'vectors', 'slow']:
        if opt in item.keywords and not item.config.getoption("--%s" % opt):
            pytest.skip("need --%s option to run" % opt)


@ -1,4 +1,6 @@
import pytest
@pytest.mark.models
def test_simple_types(EN): def test_simple_types(EN):
tokens = EN(u'Mr. Best flew to New York on Saturday morning.') tokens = EN(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents) ents = list(tokens.ents)


@ -1,6 +1,7 @@
import pytest import pytest
@pytest.mark.models
def test_root(EN): def test_root(EN):
tokens = EN(u"i don't have other assistance") tokens = EN(u"i don't have other assistance")
for t in tokens: for t in tokens:


@ -12,6 +12,7 @@ def sun_text():
return text return text
@pytest.mark.models
def test_consistency(EN, sun_text): def test_consistency(EN, sun_text):
tokens = EN(sun_text) tokens = EN(sun_text)
for head in tokens: for head in tokens:
@ -21,6 +22,7 @@ def test_consistency(EN, sun_text):
assert child.head is head assert child.head is head
@pytest.mark.models
def test_child_consistency(EN, sun_text): def test_child_consistency(EN, sun_text):
tokens = EN(sun_text) tokens = EN(sun_text)
@ -53,6 +55,7 @@ def test_child_consistency(EN, sun_text):
assert not children assert not children
@pytest.mark.models
def test_edges(EN): def test_edges(EN):
sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium." sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."
tokens = EN(sun_text) tokens = EN(sun_text)


@ -1,6 +1,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
@pytest.mark.models
def test_subtrees(EN): def test_subtrees(EN):
sent = EN('The four wheels on the bus turned quickly') sent = EN('The four wheels on the bus turned quickly')
wheels = sent[2] wheels = sent[2]


@ -45,7 +45,7 @@ def test1():
codec = HuffmanCodec(list(enumerate(probs))) codec = HuffmanCodec(list(enumerate(probs)))
py_codes = py_encode(dict(enumerate(probs))) py_codes = py_encode(dict(enumerate(probs)))
py_codes = py_codes.items() py_codes = list(py_codes.items())
py_codes.sort() py_codes.sort()
assert codec.strings == [c for i, c in py_codes] assert codec.strings == [c for i, c in py_codes]
@ -60,7 +60,7 @@ def test_round_trip():
strings = list(codec.strings) strings = list(codec.strings)
codes = {codec.leaves[i]: strings[i] for i in range(len(codec.leaves))} codes = {codec.leaves[i]: strings[i] for i in range(len(codec.leaves))}
bits = codec.encode(message) bits = codec.encode(message)
string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in bits.as_bytes()) string = ''.join('{0:b}'.format(c).rjust(8, '0')[::-1] for c in bits.as_bytes())
for word in message: for word in message:
code = codes[word] code = codes[word]
assert string[:len(code)] == code assert string[:len(code)] == code
@ -76,7 +76,7 @@ def test_rosetta():
symb2freq = defaultdict(int) symb2freq = defaultdict(int)
for ch in txt: for ch in txt:
symb2freq[ch] += 1 symb2freq[ch] += 1
by_freq = symb2freq.items() by_freq = list(symb2freq.items())
by_freq.sort(reverse=True, key=lambda item: item[1]) by_freq.sort(reverse=True, key=lambda item: item[1])
symbols = [sym for sym, prob in by_freq] symbols = [sym for sym, prob in by_freq]
@ -96,6 +96,7 @@ def test_rosetta():
assert my_exp_len == py_exp_len assert my_exp_len == py_exp_len
@pytest.mark.slow
def test_vocab(EN): def test_vocab(EN):
codec = HuffmanCodec([(w.orth, numpy.exp(w.prob)) for w in EN.vocab]) codec = HuffmanCodec([(w.orth, numpy.exp(w.prob)) for w in EN.vocab])
expected_length = 0 expected_length = 0
@ -105,6 +106,7 @@ def test_vocab(EN):
assert 8 < expected_length < 15 assert 8 < expected_length < 15
@pytest.mark.slow
def test_freqs(): def test_freqs():
freqs = [] freqs = []
words = [] words = []


@ -0,0 +1,23 @@
import pytest

from spacy.serialize.packer import Packer
from spacy.attrs import ORTH, SPACY
from spacy.tokens import Doc
import math


def test_read_write(EN):
    doc1 = EN(u'This is a simple test. With a couple of sentences.')
    doc2 = EN(u'This is another test document.')

    with open('/tmp/spacy_docs.bin', 'wb') as file_:
        file_.write(doc1.to_bytes())
        file_.write(doc2.to_bytes())

    with open('/tmp/spacy_docs.bin', 'rb') as file_:
        bytes1, bytes2 = Doc.read_bytes(file_)
        r1 = Doc(EN.vocab).from_bytes(bytes1)
        r2 = Doc(EN.vocab).from_bytes(bytes2)

    assert r1.string == doc1.string
    assert r2.string == doc2.string


@ -56,12 +56,12 @@ def test_char_packer(vocab):
bits = BitArray() bits = BitArray()
bits.seek(0) bits.seek(0)
byte_str = b'the dog jumped' byte_str = bytearray(b'the dog jumped')
packer.char_codec.encode(byte_str, bits) packer.char_codec.encode(byte_str, bits)
bits.seek(0) bits.seek(0)
result = [b''] * len(byte_str) result = [b''] * len(byte_str)
packer.char_codec.decode(bits, result) packer.char_codec.decode(bits, result)
assert b''.join(result) == byte_str assert bytearray(result) == byte_str
def test_packer_unannotated(tokenizer): def test_packer_unannotated(tokenizer):
@ -120,5 +120,3 @@ def test_packer_annotated(tokenizer):
assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD'] assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']
assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT'] assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']
assert [(t.head.i - t.i) for t in result] == [1, 1, 0] assert [(t.head.i - t.i) for t in result] == [1, 1, 0]


@ -1,6 +1,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
@pytest.mark.models
def test_merge_tokens(EN): def test_merge_tokens(EN):
tokens = EN(u'Los Angeles start.') tokens = EN(u'Los Angeles start.')
assert len(tokens) == 4 assert len(tokens) == 4
@ -12,6 +14,7 @@ def test_merge_tokens(EN):
assert tokens[0].head.orth_ == 'start' assert tokens[0].head.orth_ == 'start'
@pytest.mark.models
def test_merge_heads(EN): def test_merge_heads(EN):
tokens = EN(u'I found a pilates class near work.') tokens = EN(u'I found a pilates class near work.')
assert len(tokens) == 8 assert len(tokens) == 8


@ -9,6 +9,7 @@ def doc(EN):
return EN('This is a sentence. This is another sentence. And a third.') return EN('This is a sentence. This is another sentence. And a third.')
@pytest.mark.models
def test_sent_spans(doc): def test_sent_spans(doc):
sents = list(doc.sents) sents = list(doc.sents)
assert sents[0].start == 0 assert sents[0].start == 0
@ -17,6 +18,7 @@ def test_sent_spans(doc):
assert sum(len(sent) for sent in sents) == len(doc) assert sum(len(sent) for sent in sents) == len(doc)
@pytest.mark.models
def test_root(doc): def test_root(doc):
np = doc[2:4] np = doc[2:4]
assert len(np) == 2 assert len(np) == 2


@ -3,6 +3,7 @@ from __future__ import unicode_literals
import pytest import pytest
@pytest.mark.models
def test_am_pm(en_nlp): def test_am_pm(en_nlp):
numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
variants = ['a.m.', 'am', 'p.m.', 'pm'] variants = ['a.m.', 'am', 'p.m.', 'pm']
@ -14,7 +15,7 @@ def test_am_pm(en_nlp):
tokens = en_nlp(string, merge_mwes=True) tokens = en_nlp(string, merge_mwes=True)
assert tokens[4].orth_ == '%s%s%s' % (num, space, var) assert tokens[4].orth_ == '%s%s%s' % (num, space, var)
ents = list(tokens.ents) ents = list(tokens.ents)
assert len(ents) == 1 assert len(ents) == 1, ents
assert ents[0].label_ == 'TIME', string assert ents[0].label_ == 'TIME', string
if ents[0].start == 4 and ents[0].end == 5: if ents[0].start == 4 and ents[0].end == 5:
assert ents[0].orth_ == '%s%s%s' % (num, space, var) assert ents[0].orth_ == '%s%s%s' % (num, space, var)


@ -17,6 +17,7 @@ def lemmas(tagged):
return [t.lemma_ for t in tagged] return [t.lemma_ for t in tagged]
@pytest.mark.models
def test_lemmas(lemmas, tagged): def test_lemmas(lemmas, tagged):
assert lemmas[0] == 'banana' assert lemmas[0] == 'banana'
assert lemmas[1] == 'in' assert lemmas[1] == 'in'


@ -12,6 +12,7 @@ def morph_exc():
} }
@pytest.mark.models
def test_load_exc(morph_exc): def test_load_exc(morph_exc):
# Do this local as we want to modify it # Do this local as we want to modify it
nlp = English() nlp = English()


@ -1,7 +1,9 @@
from spacy.en import English from spacy.en import English
import six import six
import pytest
@pytest.mark.models
def test_tag_names(EN): def test_tag_names(EN):
tokens = EN(u'I ate pizzas with anchovies.', parse=False, tag=True) tokens = EN(u'I ate pizzas with anchovies.', parse=False, tag=True)
pizza = tokens[2] pizza = tokens[2]


@ -1,8 +1,9 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
"""Sphinx doctest is just too hard. Manually paste doctest examples here""" """Sphinx doctest is just too hard. Manually paste doctest examples here"""
from spacy.en.attrs import IS_LOWER from spacy.en.attrs import IS_LOWER
import pytest
@pytest.mark.models
def test_1(): def test_1():
import spacy.en import spacy.en
from spacy.parts_of_speech import ADV from spacy.parts_of_speech import ADV
@ -21,6 +22,7 @@ def test_1():
assert o == -11.07155704498291 assert o == -11.07155704498291
@pytest.mark.models
def test2(): def test2():
import spacy.en import spacy.en
from spacy.parts_of_speech import ADV from spacy.parts_of_speech import ADV
@ -41,6 +43,7 @@ def test2():
-11.07155704498291 -11.07155704498291
@pytest.mark.models
def test3(): def test3():
import spacy.en import spacy.en
from spacy.parts_of_speech import ADV from spacy.parts_of_speech import ADV


@ -15,6 +15,7 @@ def test_attr_of_token(EN):
assert feats_array[0][0] != feats_array[0][1] assert feats_array[0][0] != feats_array[0][1]
@pytest.mark.models
def test_tag(EN): def test_tag(EN):
text = u'A nice sentence.' text = u'A nice sentence.'
tokens = EN(text) tokens = EN(text)
@ -26,6 +27,7 @@ def test_tag(EN):
assert feats_array[3][1] == tokens[3].tag assert feats_array[3][1] == tokens[3].tag
@pytest.mark.models
def test_dep(EN): def test_dep(EN):
text = u'A nice sentence.' text = u'A nice sentence.'
tokens = EN(text) tokens = EN(text)


@ -4,6 +4,7 @@ import pytest
from spacy.parts_of_speech import ADV from spacy.parts_of_speech import ADV
@pytest.mark.models
def test_prob(EN): def test_prob(EN):
tokens = EN(u'Give it back', parse=False) tokens = EN(u'Give it back', parse=False)
give = tokens[0] give = tokens[0]


@ -7,6 +7,7 @@ from spacy.en.attrs import IS_STOP
import pytest import pytest
@pytest.mark.models
def test_strings(EN): def test_strings(EN):
tokens = EN(u'Give it back! He pleaded.') tokens = EN(u'Give it back! He pleaded.')
token = tokens[0] token = tokens[0]


@ -9,6 +9,7 @@ data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
# Let this have its own instances, as we have to be careful about memory here # Let this have its own instances, as we have to be careful about memory here
# that's the point, after all # that's the point, after all
@pytest.mark.models
def get_orphan_token(text, i): def get_orphan_token(text, i):
nlp = English(load_vectors=False, data_dir=data_dir) nlp = English(load_vectors=False, data_dir=data_dir)
tokens = nlp(text) tokens = nlp(text)
@ -18,6 +19,7 @@ def get_orphan_token(text, i):
return token return token
@pytest.mark.models
def test_orphan(): def test_orphan():
orphan = get_orphan_token('An orphan token', 1) orphan = get_orphan_token('An orphan token', 1)
gc.collect() gc.collect()
@ -36,6 +38,7 @@ def _orphan_from_list(toks):
return lst return lst
@pytest.mark.models
def test_list_orphans(): def test_list_orphans():
# Test case from NSchrading # Test case from NSchrading
nlp = English(load_vectors=False, data_dir=data_dir) nlp = English(load_vectors=False, data_dir=data_dir)


@ -5,7 +5,7 @@ from spacy.tokens import Doc
import pytest import pytest
def test_getitem(EN): def mest_getitem(EN):
tokens = EN(u'Give it back! He pleaded.') tokens = EN(u'Give it back! He pleaded.')
assert tokens[0].orth_ == 'Give' assert tokens[0].orth_ == 'Give'
assert tokens[-1].orth_ == '.' assert tokens[-1].orth_ == '.'
@ -13,10 +13,19 @@ def test_getitem(EN):
tokens[len(tokens)] tokens[len(tokens)]
def test_serialize(EN): def mest_serialize(EN):
tokens = EN(u' Give it back! He pleaded. ') tokens = EN(u'Give it back! He pleaded.')
packed = tokens.serialize() packed = tokens.to_bytes()
new_tokens = Doc.deserialize(EN.vocab, packed) new_tokens = Doc(EN.vocab).from_bytes(packed)
assert tokens.string == new_tokens.string
assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
def test_serialize_whitespace(EN):
    tokens = EN(u' Give it back! He pleaded. ')
    packed = tokens.to_bytes()
    new_tokens = Doc(EN.vocab).from_bytes(packed)
assert tokens.string == new_tokens.string assert tokens.string == new_tokens.string
assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens] assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens]


@ -4,13 +4,14 @@ from spacy.en import English
import pytest import pytest
@pytest.mark.vectors
def test_vec(EN): def test_vec(EN):
hype = EN.vocab['hype'] hype = EN.vocab['hype']
assert hype.orth_ == 'hype' assert hype.orth_ == 'hype'
assert 0.08 >= hype.repvec[0] > 0.07 assert 0.08 >= hype.repvec[0] > 0.07
@pytest.mark.vectors
def test_capitalized(EN): def test_capitalized(EN):
hype = EN.vocab['Hype'] hype = EN.vocab['Hype']
assert hype.orth_ == 'Hype' assert hype.orth_ == 'Hype'


@ -39,7 +39,7 @@ def test_retrieve_id(sstore):
def test_med_string(sstore): def test_med_string(sstore):
nine_char_string = sstore[b'0123456789'] nine_char_string = sstore[b'0123456789']
assert sstore[nine_char_string] == b'0123456789' assert sstore[nine_char_string] == u'0123456789'
dummy = sstore[b'A'] dummy = sstore[b'A']
assert sstore[b'0123456789'] == nine_char_string assert sstore[b'0123456789'] == nine_char_string