mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)

Merge branch 'master' of https://github.com/honnibal/spaCy

commit 2e6a60eaec

.travis.yml (17 changes)

@@ -11,11 +11,18 @@ python:
 # install dependencies
 install:
   - "pip install --upgrade setuptools"
-  - "rm -rf spacy/"
-  - "pip install spacy"
+  - "pip install cython fabric fabtools"
+  - "pip install -r requirements.txt"
+  - "python setup.py build_ext --inplace"
+  - "mkdir -p corpora/en"
+  - "cd corpora/en"
+  - "wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz"
+  - "tar -xzf WordNet-3.0.tar.gz"
+  - "mv WordNet-3.0 wordnet"
+  - "cd ../../"
+  - "export PYTHONPATH=`pwd`"
+  - "python bin/init_model.py lang_data/en corpora/en spacy/en/data"
+
 # run tests
 script:
-  - py.test tests/tokenizer/
-  - py.test tests/vocab/
-  - py.test tests/tagger/
+  - "py.test tests/ -x"
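In sum, CI now builds spaCy in place (Cython extensions via build_ext --inplace), downloads WordNet and runs bin/init_model.py to generate the model data, then runs the whole test suite fail-fast with "py.test tests/ -x", instead of installing a released spacy from PyPI and testing three directories.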
							
								
								
									
bin/gather_freqs.py (27 lines, new file)

@@ -0,0 +1,27 @@
+import plac
+
+
+def main(in_loc, out_loc):
+    out_file = open(out_loc, 'w')
+    this_key = None
+    this_freq = 0
+    df = 0
+    for line in open(in_loc):
+        line = line.strip()
+        if not line:
+            continue
+        freq, key = line.split('\t', 1)
+        freq = int(freq)
+        if this_key is not None and key != this_key:
+            out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
+            this_key = key
+            this_freq = freq
+            df = 1
+        else:
+            this_freq += freq
+            df += 1
+    out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
+    out_file.close()
+
+
+if __name__ == '__main__':
+    plac.call(main)
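The script folds sorted "freq<TAB>key" lines into "freq<TAB>doc_freq<TAB>key" records, so its input must be grouped by key (e.g. piped through sort). Note that as committed, this_key is assigned only inside the key-change branch, which can never trigger because this_key starts as None, so every line accumulates into a single record. A minimal sketch of the presumably intended merge logic (an illustration, not the committed code):

# Hedged sketch: merge sorted "freq\tkey" lines into "freq\tdf\tkey"
# records, initializing the key from the first line.
def merge_freqs(lines):
    this_key, this_freq, df = None, 0, 0
    for line in lines:
        line = line.strip()
        if not line:
            continue
        freq, key = line.split('\t', 1)
        if key != this_key:
            if this_key is not None:
                yield '%d\t%d\t%s' % (this_freq, df, this_key)
            this_key, this_freq, df = key, int(freq), 1
        else:
            this_freq += int(freq)
            df += 1
    if this_key is not None:
        yield '%d\t%d\t%s' % (this_freq, df, this_key)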
bin/init_model.py

@@ -15,6 +15,8 @@ Requires:
         * clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
         * vectors.tgz --- output of something like word2vec
 """
+from __future__ import unicode_literals
+
 import plac
 from pathlib import Path
 

@@ -45,7 +47,7 @@ def setup_tokenizer(lang_data_dir, tok_dir):
 
 def _read_clusters(loc):
     if not loc.exists():
-        print "Warning: Clusters file not found"
+        print("Warning: Clusters file not found")
         return {}
     clusters = {}
     for line in codecs.open(str(loc), 'r', 'utf8'):

@@ -60,7 +62,7 @@ def _read_clusters(loc):
         else:
             clusters[word] = '0'
     # Expand clusters with re-casing
-    for word, cluster in clusters.items():
+    for word, cluster in list(clusters.items()):
         if word.lower() not in clusters:
             clusters[word.lower()] = cluster
         if word.title() not in clusters:

@@ -72,7 +74,7 @@ def _read_clusters(loc):
 
 def _read_probs(loc):
     if not loc.exists():
-        print "Warning: Probabilities file not found"
+        print("Warning: Probabilities file not found")
         return {}
     probs = {}
     for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):

@@ -85,7 +87,7 @@ def _read_probs(loc):
 def _read_senses(loc):
     lexicon = defaultdict(lambda: defaultdict(list))
     if not loc.exists():
-        print "Warning: WordNet senses not found"
+        print("Warning: WordNet senses not found")
         return lexicon
     sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
     pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}

@@ -109,13 +111,20 @@ def setup_vocab(src_dir, dst_dir):
     if vectors_src.exists():
         write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
     else:
-        print "Warning: Word vectors file not found"
+        print("Warning: Word vectors file not found")
     vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
     clusters = _read_clusters(src_dir / 'clusters.txt')
     probs = _read_probs(src_dir / 'words.sgt.prob')
-    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
+    if not probs:
+        min_prob = 0.0
+    else:
+        min_prob = min(probs.values())
+    for word in clusters:
+        if word not in probs:
+            probs[word] = min_prob
+
     lexicon = []
-    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
+    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
         entry = get_lex_props(word)
         if word in clusters or float(prob) >= -17:
             entry['prob'] = float(prob)

@@ -144,7 +153,7 @@ def main(lang_data_dir, corpora_dir, model_dir):
     setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
     setup_vocab(corpora_dir, model_dir / 'vocab')
     if not (model_dir / 'wordnet').exists():
-        copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))
+        copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))
 
 
 if __name__ == '__main__':
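The substantive change in setup_vocab() is the back-fill: clustered words missing from words.sgt.prob are added with the smallest observed log-probability, so the lexicon loop (which iterates over probs) still includes them. A small self-contained illustration with toy values, not real model data:

probs = {'the': -3.0, 'cat': -9.5}
clusters = {'the': '1111', 'cat': '1010', 'frumious': '1011'}
min_prob = min(probs.values()) if probs else 0.0
for word in clusters:
    if word not in probs:
        probs[word] = min_prob          # cluster-only words enter the lexicon
assert probs['frumious'] == -9.5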
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 from __future__ import division
 from __future__ import unicode_literals
+from __future__ import print_function
 
 import os
 from os import path

@@ -107,7 +108,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
 
     nlp = Language(data_dir=model_dir)
 
-    print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %"
+    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
     for itn in range(n_iter):
         scorer = Scorer()
         loss = 0

@@ -138,9 +139,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
                 nlp.entity.train(tokens, gold)
                 nlp.tagger.train(tokens, gold.tags)
         random.shuffle(gold_tuples)
-        print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
-                                               scorer.tags_acc,
-                                               scorer.token_acc)
+        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
+                                                   scorer.tags_acc,
+                                                   scorer.token_acc))
     nlp.end_training()
 
 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,

@@ -219,14 +220,14 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos
     #    write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
     scorer = evaluate(English, list(read_json_file(dev_loc)),
                       model_dir, gold_preproc=gold_preproc, verbose=verbose)
-    print 'TOK', scorer.token_acc
-    print 'POS', scorer.tags_acc
-    print 'UAS', scorer.uas
-    print 'LAS', scorer.las
+    print('TOK', scorer.token_acc)
+    print('POS', scorer.tags_acc)
+    print('UAS', scorer.uas)
+    print('LAS', scorer.las)
 
-    print 'NER P', scorer.ents_p
-    print 'NER R', scorer.ents_r
-    print 'NER F', scorer.ents_f
+    print('NER P', scorer.ents_p)
+    print('NER R', scorer.ents_r)
+    print('NER F', scorer.ents_f)
 
 
 if __name__ == '__main__':
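Most of the edits in this file, like those in bin/init_model.py above, are the mechanical Python 3 conversion: once print_function is imported, the statement form is a SyntaxError, so every print becomes a call. A one-line illustration:

from __future__ import print_function  # makes print a function on Python 2 too
print('TOK', 0.98)                     # prints: TOK 0.98, on Python 2 and 3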
							
								
								
									
corpora/en/clusters.txt (316709 lines, new file)

File diff suppressed because it is too large.
requirements.txt

@@ -2,7 +2,7 @@ cython
 cymem == 1.11
 pathlib
 preshed == 0.37
-thinc == 3.2
+thinc == 3.3
 murmurhash == 0.24
 unidecode
 numpy
							
								
								
									
setup.py (3 changes)

@@ -120,7 +120,7 @@ def run_setup(exts):
         ext_modules=exts,
         license="Dual: Commercial or AGPL",
         install_requires=['numpy', 'murmurhash', 'cymem >= 1.11', 'preshed == 0.37',
-                          'thinc == 3.2', "unidecode", 'wget', 'plac', 'six',
+                          'thinc == 3.3', "unidecode", 'wget', 'plac', 'six',
                           'ujson'],
         setup_requires=["headers_workaround"],
     )

@@ -162,6 +162,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
              'spacy.gold', 'spacy.orth',
              'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
              'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
+             'spacy.cfile',
              'spacy.syntax.ner']
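Note that the thinc pin is bumped to 3.3 both here and in requirements.txt above; the two dependency lists are maintained by hand, so version pins have to be updated in both places to stay consistent. The new spacy.cfile module is also registered in MOD_NAMES so its Cython extension gets built.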
							
								
								
									
spacy/cfile.pxd (12 lines, new file)

@@ -0,0 +1,12 @@
+from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from cymem.cymem cimport Pool
+
+cdef class CFile:
+    cdef FILE* fp
+    cdef bint is_open
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
							
								
								
									
spacy/cfile.pyx (40 lines, new file)

@@ -0,0 +1,40 @@
+from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+
+
+cdef class CFile:
+    def __init__(self, loc, mode):
+        if isinstance(mode, unicode):
+            mode_str = mode.encode('ascii')
+        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+        self.fp = fopen(<char*>bytes_loc, mode_str)
+        if self.fp == NULL:
+            raise IOError("Could not open binary file %s" % bytes_loc)
+        self.is_open = True
+
+    def __dealloc__(self):
+        if self.is_open:
+            fclose(self.fp)
+
+    def close(self):
+        fclose(self.fp)
+        self.is_open = False
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
+        st = fread(dest, elem_size, number, self.fp)
+        if st != number:
+            raise IOError
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
+        st = fwrite(src, elem_size, number, self.fp)
+        if st != number:
+            raise IOError
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
+        cdef void* dest = mem.alloc(number, elem_size)
+        self.read_into(dest, number, elem_size)
+        return dest
+
+    def write_unicode(self, unicode value):
+        cdef bytes py_bytes = value.encode('utf8')
+        cdef char* chars = <char*>py_bytes
+        self.write(sizeof(char), len(py_bytes), chars)
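CFile is a thin wrapper over C stdio for reading and writing binary model files. Two details worth flagging in the code as committed: mode_str is only assigned when mode is a unicode string, so a bytes mode would leave it unbound, and write_unicode() calls self.write(...), which is not among the methods defined in this file. A rough pure-Python analogue of the read_into contract (an illustration, not the committed API):

def read_into(fp, number, elem_size):
    # Read exactly `number` elements of `elem_size` bytes each, or fail
    # loudly, mirroring CFile.read_into's short-read check.
    data = fp.read(number * elem_size)
    if len(data) != number * elem_size:
        raise IOError("short read from binary file")
    return data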
@@ -95,15 +95,15 @@ class English(object):
 
         self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))
 
-        if Tagger:
+        if Tagger and path.exists(path.join(data_dir, 'pos')):
             self.tagger = Tagger(self.vocab.strings, data_dir)
         else:
             self.tagger = None
-        if Parser:
+        if Parser and path.exists(path.join(data_dir, 'deps')):
             self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps'))
         else:
             self.parser = None
-        if Entity:
+        if Entity and path.exists(path.join(data_dir, 'ner')):
             self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
         else:
             self.entity = None

@@ -153,15 +153,14 @@ class English(object):
         self.tagger.model.end_training()
         self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
 
-        packer = Packer(self.vocab, [
-            (TAG, self.tagger.moves.freqs[TAG].items()),
-            (HEAD, self.parser.moves.freqs[HEAD].items()),
-            (DEP, self.parser.moves.freqs[DEP].items()),
-            (ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()),
-            (ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items())
-        ])
-
-        packer.dump(path.join(data_dir, 'vocab'))
+        with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
+            file_.write(
+                json.dumps([
+                    (TAG, self.tagger.freqs[TAG].items()),
+                    (DEP, self.parser.moves.freqs[DEP].items()),
+                    (ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()),
+                    (ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items()),
+                    (HEAD, self.parser.moves.freqs[HEAD].items())]))
 
     @property
     def tags(self):
			||||||
@@ -14,6 +14,9 @@ from ..attrs cimport LEMMA as _LEMMA
 from ..attrs cimport POS as _POS
 from ..attrs cimport TAG as _TAG
 from ..attrs cimport DEP as _DEP
+from ..attrs cimport HEAD as _HEAD
+from ..attrs cimport ENT_IOB as _ENT_IOB
+from ..attrs cimport ENT_TYPE as _ENT_TYPE
 
 
 cpdef enum:
@@ -262,6 +262,9 @@ cdef class EnPosTagger:
                                                  'morphs.json'))))
         self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
         self.freqs = {TAG: defaultdict(int)}
+        for tag in self.tag_names:
+            self.freqs[TAG][self.strings[tag]] = 1
+        self.freqs[TAG][0] = 1
 
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -3,8 +3,6 @@ from cymem.cymem cimport Pool
 from .structs cimport TokenC
 from .syntax.transition_system cimport Transition
 
-cimport numpy
-
 
 cdef struct GoldParseC:
     int* tags
@@ -1,7 +1,5 @@
 import numpy
 import codecs
-import json
-import ujson
 import random
 import re
 import os

@@ -9,6 +7,11 @@ from os import path
 
 from libc.string cimport memset
 
+try:
+    import ujson as json
+except ImportError:
+    import json
+
+
 def tags_to_entities(tags):
     entities = []

@@ -128,7 +131,7 @@ def read_json_file(loc, docs_filter=None):
             yield from read_json_file(path.join(loc, filename))
     else:
         with open(loc) as file_:
-            docs = ujson.load(file_)
+            docs = json.load(file_)
         for doc in docs:
             if docs_filter is not None and not docs_filter(doc):
                 continue
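The try/except import is the usual pattern for optional speedups: ujson is used when installed, with the stdlib json module (which exposes the same load() surface for the calls used here) as the fallback, and read_json_file() then goes through whichever module was bound to the json name.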
@@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil
 
 
 cdef class BitArray:
-    cdef bytes data
+    cdef bytearray data
     cdef uchar byte
     cdef uchar bit_of_byte
     cdef uint32_t i
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 from libc.string cimport memcpy
 
 # Note that we're setting the most significant bits here first, when in practice

@@ -15,7 +17,7 @@ cdef Code bit_append(Code code, bint bit) nogil:
 
 cdef class BitArray:
     def __init__(self, data=b''):
-        self.data = data
+        self.data = bytearray(data)
         self.byte = 0
         self.bit_of_byte = 0
         self.i = 0

@@ -45,7 +47,7 @@ cdef class BitArray:
         start_bit = self.i % 8
 
         if start_bit != 0 and start_byte < len(self.data):
-            byte = ord(self.data[start_byte])
+            byte = self.data[start_byte]
             for i in range(start_bit, 8):
                 self.i += 1
                 yield 1 if (byte & (one << i)) else 0

@@ -68,18 +70,24 @@ cdef class BitArray:
 
         # TODO portability
         cdef uchar[4] chars
-        chars[0] = <uchar>ord(self.data[start_byte])
-        chars[1] = <uchar>ord(self.data[start_byte+1])
-        chars[2] = <uchar>ord(self.data[start_byte+2])
-        chars[3] = <uchar>ord(self.data[start_byte+3])
+        chars[0] = self.data[start_byte]
+        chars[1] = self.data[start_byte+1]
+        chars[2] = self.data[start_byte+2]
+        chars[3] = self.data[start_byte+3]
         cdef uint32_t output
         memcpy(&output, chars, 4)
         self.i += 32
         return output
 
     def as_bytes(self):
+        cdef unsigned char byte_char
         if self.bit_of_byte != 0:
-            return self.data + chr(self.byte)
+            byte = chr(self.byte)
+            # Jump through some hoops for Python3
+            if isinstance(byte, unicode):
+                return self.data + <bytes>(&self.byte)[:1]
+            else:
+                return self.data + chr(self.byte)
         else:
             return self.data

@@ -92,7 +100,7 @@ cdef class BitArray:
         self.bit_of_byte += 1
         self.i += 1
         if self.bit_of_byte == 8:
-            self.data += chr(self.byte)
+            self.data += bytearray((self.byte,))
             self.byte = 0
             self.bit_of_byte = 0

@@ -106,7 +114,7 @@ cdef class BitArray:
                 self.byte &= ~(one << self.bit_of_byte)
             self.bit_of_byte += 1
             if self.bit_of_byte == 8:
-                self.data += chr(self.byte)
+                self.data += <bytes>self.byte
                 self.byte = 0
                 self.bit_of_byte = 0
             self.i += 1
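The recurring change across this file and the Huffman codec below is storing the bit stream in a bytearray rather than bytes. The point is indexing semantics: indexing bytes returns a one-character string on Python 2 but an int on Python 3, while bytearray indexing returns an int on both, so the ord() calls can be dropped. For instance:

data = bytearray(b'ab')
assert data[0] == 97             # an int on Python 2 and Python 3 alike
data += bytearray((255,))        # append a raw byte value, as BitArray does
assert bytes(data) == b'ab\xff'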
@@ -1,4 +1,5 @@
 # cython: profile=True
+from __future__ import unicode_literals
 cimport cython
 from libcpp.queue cimport priority_queue
 from libcpp.pair cimport pair

@@ -110,14 +111,14 @@ cdef class HuffmanCodec:
         cdef int branch
 
         cdef int n_msg = msg.shape[0]
-        cdef bytes bytes_ = bits.as_bytes()
+        cdef bytearray bytes_ = bits.as_bytes()
         cdef unsigned char byte
         cdef int i_msg = 0
         cdef int i_byte = bits.i // 8
         cdef unsigned char i_bit = 0
         cdef unsigned char one = 1
         while i_msg < n_msg:
-            byte = ord(bytes_[i_byte])
+            byte = bytes_[i_byte]
             i_byte += 1
             for i_bit in range(8):
                 branch = node.right if (byte & (one << i_bit)) else node.left

@@ -138,11 +139,11 @@ cdef class HuffmanCodec:
         def __get__(self):
             output = []
             cdef int i, j
-            cdef bytes string
+            cdef unicode string
             cdef Code code
             for i in range(self.codes.size()):
                 code = self.codes[i]
-                string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
+                string = '{0:b}'.format(code.bits).rjust(code.length, '0')
                 string = string[::-1]
                 output.append(string)
             return output
@@ -10,6 +10,7 @@ from libcpp.pair cimport pair
 from cymem.cymem cimport Address, Pool
 from preshed.maps cimport PreshMap
 from preshed.counter cimport PreshCounter
+import json
 
 from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 from ..tokens.doc cimport Doc

@@ -65,7 +66,7 @@ def _gen_orths(Vocab vocab):
 def _gen_chars(Vocab vocab):
     cdef attr_t orth
     cdef size_t addr
-    char_weights = {chr(i): 1e-20 for i in range(256)}
+    char_weights = {i: 1e-20 for i in range(256)}
     cdef unicode string
     cdef bytes char
     cdef bytes utf8_str

@@ -74,9 +75,9 @@ def _gen_chars(Vocab vocab):
         string = vocab.strings[lex.orth]
         utf8_str = string.encode('utf8')
         for char in utf8_str:
-            char_weights.setdefault(char, 0.0)
-            char_weights[char] += c_exp(lex.prob)
-        char_weights[b' '] += c_exp(lex.prob)
+            char_weights.setdefault(ord(char), 0.0)
+            char_weights[ord(char)] += c_exp(lex.prob)
+        char_weights[ord(' ')] += c_exp(lex.prob)
     return char_weights.items()

@@ -98,33 +99,34 @@ cdef class Packer:
         self._codecs = tuple(codecs)
         self.attrs = tuple(attrs)
 
-    @classmethod
-    def from_dir(cls, Vocab vocab, data_dir):
-        return cls(vocab, util.read_encoding_freqs(data_dir))
-
     def pack(self, Doc doc):
         bits = self._orth_encode(doc)
         if bits is None:
             bits = self._char_encode(doc)
-
         cdef int i
         if self.attrs:
             array = doc.to_array(self.attrs)
             for i, codec in enumerate(self._codecs):
-                codec.encode_int32(array[:, i], bits)
-        return bits
+                codec.encode(array[:, i], bits)
+        return bits.as_bytes()
 
-    def unpack(self, BitArray bits):
+    def unpack(self, data):
+        doc = Doc(self.vocab)
+        self.unpack_into(data, doc)
+        return doc
+
+    def unpack_into(self, byte_string, Doc doc):
+        bits = BitArray(byte_string)
         bits.seek(0)
         cdef int32_t length = bits.read32()
         if length >= 0:
-            doc = self._orth_decode(bits, length)
+            self._orth_decode(bits, length, doc)
         else:
-            doc = self._char_decode(bits, -length)
+            self._char_decode(bits, -length, doc)
 
         array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32)
         for i, codec in enumerate(self._codecs):
-            codec.decode_int32(bits, array[:, i])
+            codec.decode(bits, array[:, i])
 
         doc.from_array(self.attrs, array)
         return doc

@@ -141,20 +143,13 @@ cdef class Packer:
             bits.append(bool(token.whitespace_))
         return bits
 
-    def _orth_decode(self, BitArray bits, n):
-        orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
-        self.orth_codec.decode_int32(bits, orths)
-        orths_and_spaces = zip(orths, bits)
-        cdef Doc doc = Doc(self.vocab, orths_and_spaces)
-        return doc
-
     def _char_encode(self, Doc doc):
         cdef bytes utf8_str = doc.string.encode('utf8')
         cdef BitArray bits = BitArray()
         cdef int32_t length = len(utf8_str)
         # Signal chars with negative length
         bits.extend(-length, 32)
-        self.char_codec.encode(utf8_str, bits)
+        self.char_codec.encode(bytearray(utf8_str), bits)
         cdef int i, j
         for i in range(doc.length):
             for j in range(doc.data[i].lex.length-1):

@@ -164,12 +159,24 @@ cdef class Packer:
                 bits.append(False)
         return bits
 
-    def _char_decode(self, BitArray bits, n):
+    def _orth_decode(self, BitArray bits, int32_t n, Doc doc):
+        cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
+        self.orth_codec.decode_int32(bits, orths)
+        cdef int i
+        cdef bint space
+        spaces = iter(bits)
+        for i in range(n):
+            orth = orths[i]
+            space = next(spaces)
+            lex = self.vocab.get_by_orth(doc.mem, orth)
+            doc.push_back(lex, space)
+        return doc
+
+    def _char_decode(self, BitArray bits, int32_t n, Doc doc):
         cdef bytearray utf8_str = bytearray(n)
         self.char_codec.decode(bits, utf8_str)
 
         cdef unicode string = utf8_str.decode('utf8')
-        cdef Doc tokens = Doc(self.vocab)
         cdef int start = 0
         cdef bint is_spacy
         cdef int length = len(string)

@@ -178,11 +185,11 @@ cdef class Packer:
         for is_end_token in bits:
             if is_end_token:
                 span = string[start:i+1]
-                lex = self.vocab.get(tokens.mem, span)
+                lex = self.vocab.get(doc.mem, span)
                 is_spacy = (i+1) < length and string[i+1] == u' '
-                tokens.push_back(lex, is_spacy)
+                doc.push_back(lex, is_spacy)
                 start = i + 1 + is_spacy
             i += 1
             if i >= n:
                 break
-        return tokens
+        return doc
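The Packer's public surface changes shape here: pack() now returns a plain byte string via bits.as_bytes() instead of a BitArray, unpack() allocates and returns a fresh Doc, and the new unpack_into() decodes into a caller-supplied Doc, which is what Doc.from_bytes() uses below. The sign of the leading 32-bit length still selects the decoder: non-negative means orth codes (one per token), negative means UTF-8 characters.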
@@ -81,6 +81,7 @@ cdef class StringStore:
     def __getitem__(self, object string_or_id):
         cdef bytes byte_string
         cdef const Utf8Str* utf8str
+        cdef int id_
         if isinstance(string_or_id, int) or isinstance(string_or_id, long):
             if string_or_id == 0:
                 return u''
@@ -1,4 +1,3 @@
-# cython: profile=True
 """
 Fill an array, context, with every _atomic_ value our features reference.
 We then write the _actual features_ as tuples of the atoms. The machinery
@@ -1,4 +1,3 @@
-# cython: profile=True
 from __future__ import unicode_literals
 
 import ctypes
@@ -85,6 +85,9 @@ cdef class BiluoPushDown(TransitionSystem):
             elif gold.c.ner[i].move == OUT:
                 self.freqs[ENT_IOB][1] += 1
                 self.freqs[ENT_TYPE][0] += 1
+            else:
+                self.freqs[ENT_IOB][1] += 1
+                self.freqs[ENT_TYPE][0] += 1
 
     cdef Transition lookup_transition(self, object name) except *:
         if name == '-':
@@ -1,4 +1,3 @@
-# cython: profile=True
 """
 MALT-style dependency parser
 """

@@ -85,18 +84,17 @@ cdef class Parser:
 
         cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
                                   self.model.n_feats, self.model.n_feats)
-        self.parse(stcls, eg.c)
+        with nogil:
+            self.parse(stcls, eg.c)
         tokens.set_parse(stcls._sent)
 
     cdef void parse(self, StateClass stcls, ExampleC eg) nogil:
         while not stcls.is_final():
             memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
-
             self.moves.set_valid(eg.is_valid, stcls)
             fill_context(eg.atoms, stcls)
             self.model.set_scores(eg.scores, eg.atoms)
             eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes)
-
             self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
         self.moves.finalize_state(stcls)
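The parse loop was already declared nogil; wrapping the call site in "with nogil:" is what actually releases the GIL while the C-level transition loop runs, so other Python threads can make progress during parsing. The several "# cython: profile=True" removals in this commit serve the same performance goal, since profiling instrumentation adds overhead to every function call.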
@@ -1,4 +1,3 @@
-# cython: profile=True
 from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t
 from ..vocab cimport EMPTY_LEXEME
@@ -33,6 +33,11 @@ cdef class TransitionSystem:
         self.freqs = {}
         for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
             self.freqs[attr] = defaultdict(int)
+            self.freqs[attr][0] = 1
+        # Ensure we've seen heads. Need an official dependency length limit...
+        for i in range(512):
+            self.freqs[HEAD][i] = 1
+            self.freqs[HEAD][-i] = 1
 
     cdef int initialize_state(self, StateClass state) except -1:
         pass
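The seeding presumably exists because the Huffman codecs built from these tables can only assign codes to symbols with nonzero frequency, so every attribute gets its 0 value seen once and every head offset in (-512, 512) is registered. A self-contained illustration of the resulting table shape:

from collections import defaultdict

freqs = defaultdict(int)
freqs[0] = 1                  # every attribute sees value 0 at least once
for i in range(512):          # and HEAD sees every offset in (-512, 512)
    freqs[i] = 1
    freqs[-i] = 1
assert len(freqs) == 1023 and min(freqs.values()) == 1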
| 
						 | 
@@ -71,17 +71,6 @@ cdef class Doc:
         self.is_tagged = False
         self.is_parsed = False
         self._py_tokens = []
-        cdef const LexemeC* lex
-        cdef attr_t orth
-        cdef bint space
-        if orths_and_spaces is not None:
-            for orth, space in orths_and_spaces:
-                lex = <LexemeC*>self.vocab._by_orth.get(orth)
-                if lex != NULL:
-                    assert lex.orth == orth
-                    self.push_back(lex, space)
-                else:
-                    raise Exception('Lexeme not found: %d' % orth)

     def __getitem__(self, object i):
         """Get a token.
@@ -122,9 +111,12 @@ cdef class Doc:
     def __unicode__(self):
         return u''.join([t.string for t in self])

+    def __str__(self):
+        return u''.join([t.string for t in self])
+
     @property
     def string(self):
-        return unicode(self)
+        return u''.join([t.string for t in self])

     @property
     def ents(self):
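Defining `__str__` alongside `__unicode__` (here, and on Token below) is the usual Python 2/3 bridge: Python 3 only calls `__str__`, and building the text directly rather than via `unicode(self)` avoids a name that no longer exists on Python 3. The pattern in isolation, as a sketch:

    class Stringy(object):
        # Same bridge as the Doc/Token change: one text-building method,
        # exposed under both names so str(obj) works on Python 2 and 3.
        def _text(self):
            return u'some text'

        def __unicode__(self):
            return self._text()

        def __str__(self):
            return self._text()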
@@ -303,12 +295,11 @@ cdef class Doc:
         return self

     def to_bytes(self):
-        bits = self.vocab.packer.pack(self)
-        return struct.pack('I', len(bits)) + bits.as_bytes()
+        byte_string = self.vocab.serializer.pack(self)
+        return struct.pack('I', len(byte_string)) + byte_string

     def from_bytes(self, data):
-        bits = BitArray(data)
-        self.vocab.packer.unpack_into(bits, self)
+        self.vocab.serializer.unpack_into(data[4:], self)
         return self

     @staticmethod
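`to_bytes` now emits plain bytes behind a fixed 4-byte length prefix, and `from_bytes` skips that prefix (`data[4:]`) before unpacking. The framing itself, independent of spaCy (a sketch; `payload` stands for whatever `serializer.pack` returns):

    import struct

    def frame(payload):
        # 4-byte native-order unsigned length, then the payload itself.
        return struct.pack('I', len(payload)) + payload

    def unframe(data):
        n_bytes = struct.unpack('I', data[:4])[0]
        return data[4:4 + n_bytes]

    assert unframe(frame(b'packed doc')) == b'packed doc'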
@@ -316,15 +307,14 @@ cdef class Doc:
         keep_reading = True
         while keep_reading:
             try:
-                n_bits_str = file_.read(4)
-                if len(n_bits_str) < 4:
+                n_bytes_str = file_.read(4)
+                if len(n_bytes_str) < 4:
                     break
-                n_bits = struct.unpack('I', n_bits_str)[0]
-                n_bytes = n_bits // 8 + bool(n_bits % 8)
+                n_bytes = struct.unpack('I', n_bytes_str)[0]
                 data = file_.read(n_bytes)
             except StopIteration:
                 keep_reading = False
-            yield data
+            yield n_bytes_str + data

     # This function is terrible --- need to fix this.
     def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
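Note that `read_bytes` now yields the length prefix together with the payload, so each chunk it produces is directly consumable by `from_bytes` (this is what the new tests/serialize/test_io.py below relies on). A usage sketch, assuming `nlp` is a loaded `English()` instance:

    from spacy.tokens import Doc

    # Each yielded byte string is a complete length-prefixed document record.
    with open('/tmp/docs.bin', 'rb') as file_:
        docs = [Doc(nlp.vocab).from_bytes(byte_string)
                for byte_string in Doc.read_bytes(file_)]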
@@ -34,6 +34,9 @@ cdef class Token:
     def __unicode__(self):
         return self.string

+    def __str__(self):
+        return self.string
+
     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
         return check_flag(self.c.lex, flag_id)


@@ -65,16 +65,6 @@ def read_tokenization(lang):
     return entries


-def read_encoding_freqs(data_dir):
-    tags = json.load(open(path.join(data_dir, '..', 'pos', 'tag_freqs.json')))
-    heads = json.load(open(path.join(data_dir, '..', 'deps', 'head_freqs.json')))
-    deps = json.load(open(path.join(data_dir, '..', 'deps', 'dep_freqs.json')))
-    iob = json.load(open(path.join(data_dir, '..', 'ner', 'iob_freqs.json')))
-    ne_types = json.load(open(path.join(data_dir, '..', 'ner', 'ne_freqs.json')))
-    return [(TAG, tags), (HEAD, heads), (DEP, deps), (ENT_IOB, iob),
-            (ENT_TYPE, ne_types)]
-
-
 def read_detoken_rules(lang): # Deprecated?
     loc = path.join(DATA_DIR, lang, 'detokenize')
     entries = []
@@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64

 from .structs cimport LexemeC, TokenC
-from .typedefs cimport utf8_t, hash_t
+from .typedefs cimport utf8_t, attr_t, hash_t
 from .strings cimport StringStore


@@ -29,9 +29,12 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cdef readonly object pos_tags
     cdef readonly int length
-    cdef public object packer
+    cdef public object _serializer
+    cdef public object data_dir

     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
+    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1

     cdef PreshMap _by_hash

 spacy/vocab.pyx | 108
@@ -1,3 +1,6 @@
+from __future__ import unicode_literals
+
+
 from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from libc.string cimport memset
 from libc.stdint cimport int32_t

@@ -6,6 +9,7 @@ import bz2
 from os import path
 import codecs
 import math
+import json

 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport set_lex_struct_props

@@ -13,6 +17,7 @@ from .lexeme cimport Lexeme
 from .strings cimport hash_string
 from .orth cimport word_shape
 from .typedefs cimport attr_t
+from .cfile cimport CFile

 from cymem.cymem cimport Address
 from . import util
@@ -54,8 +59,19 @@ cdef class Vocab:
             if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
                 self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))

-        #self.packer = Packer(self, util.read_encoding_freqs(data_dir))
-        self.packer = None
+        self._serializer = None
+        self.data_dir = data_dir
+
+    property serializer:
+        def __get__(self):
+            if self._serializer is None:
+                freqs = []
+                if self.data_dir is not None:
+                    freqs_loc = path.join(self.data_dir, 'serializer.json')
+                    if path.exists(freqs_loc):
+                        freqs = json.load(open(freqs_loc))
+                self._serializer = Packer(self, freqs)
+            return self._serializer

     def __len__(self):
         """The current number of lexemes stored."""
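The `Packer` is now built lazily on first access instead of eagerly in `__init__`, and its symbol frequencies come from a single `serializer.json` in the data directory rather than the five per-annotation files that the deleted `read_encoding_freqs` used to load. The first serialization is what triggers construction; a sketch of the access pattern, assuming the English model data is installed:

    import os
    from spacy.en import English, LOCAL_DATA_DIR

    nlp = English(data_dir=os.environ.get('SPACY_DATA', LOCAL_DATA_DIR))
    # Nothing serializer-related is built yet; the first to_bytes() call
    # forces vocab.serializer into existence, and it is cached thereafter.
    byte_string = nlp(u'Hello world.').to_bytes()
    assert nlp.vocab.serializer is nlp.vocab.serializer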
@@ -82,6 +98,27 @@ cdef class Vocab:
             self._add_lex_to_vocab(key, lex)
         return lex

+    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
+        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
+        if necessary, using memory acquired from the given pool.  If the pool
+        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
+        cdef LexemeC* lex
+        lex = <LexemeC*>self._by_orth.get(orth)
+        if lex != NULL:
+            return lex
+        cdef unicode string = self.strings[orth]
+        cdef bint is_oov = mem is not self.mem
+        if len(string) < 3:
+            mem = self.mem
+        lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
+        props = self.lexeme_props_getter(string)
+        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
+        if is_oov:
+            lex.id = 0
+        else:
+            self._add_lex_to_vocab(hash_string(string), lex)
+        return lex
+
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
         self._by_hash.set(key, <void*>lex)
         self._by_orth.set(lex.orth, <void*>lex)
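The pool argument doubles as the out-of-vocabulary signal: a lexeme built from a caller-supplied pool is transient (its `id` stays 0 and it is never interned), while strings shorter than three characters are allocated from the vocab's own memory regardless, since they are near-certain to recur. A pure-Python analogue of the control flow (`make_lexeme` and `add_lex` are hypothetical stand-ins for the alloc and intern steps):

    def get_by_orth(vocab, mem, orth):
        lex = vocab.by_orth.get(orth)
        if lex is not None:
            return lex
        string = vocab.strings[orth]
        is_oov = mem is not vocab.mem      # foreign pool => out-of-vocabulary
        if len(string) < 3:
            mem = vocab.mem                # short strings live in vocab memory anyway
        lex = make_lexeme(mem, string)     # hypothetical helper
        if is_oov:
            lex.id = 0                     # transient: never added to the lexicon
        else:
            vocab.add_lex(hash(string), lex)
        return lex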
@@ -138,19 +175,16 @@ cdef class Vocab:
         if path.exists(loc):
             assert not path.isdir(loc)
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
-        cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
-        assert fp != NULL
+
+        cdef CFile fp = CFile(bytes_loc, 'wb')
         cdef size_t st
         cdef size_t addr
         cdef hash_t key
         for key, addr in self._by_hash.items():
             lexeme = <LexemeC*>addr
-            st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp)
-            assert st == 1
-            st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
-            assert st == 1
-        st = fclose(fp)
-        assert st == 0
+            fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
+            fp.write_from(lexeme, sizeof(LexemeC), 1)
+        fp.close()

     def load_lexemes(self, strings_loc, loc):
         self.strings.load(strings_loc)
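`CFile` (cimported from `.cfile` above; its implementation is not part of this diff) folds the fopen/fwrite/fclose calls and their `assert st == ...` checks into methods that raise on short reads and writes. Judging only from the call sites in this commit, a pure-Python stand-in would look roughly like this (a sketch, not the real Cython API, which works on raw pointers):

    class PyCFile(object):
        def __init__(self, loc, mode):
            self._fp = open(loc, mode)

        def write_from(self, data, elem_size, n):
            # The Cython original takes a void* source; bytes stand in here.
            assert len(data) == elem_size * n
            self._fp.write(data)

        def read_into(self, n_bytes):
            data = self._fp.read(n_bytes)
            if len(data) != n_bytes:
                raise IOError('short read')
            return data

        def close(self):
            self._fp.close()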
@@ -188,7 +222,7 @@ cdef class Vocab:
         fclose(fp)

     def load_rep_vectors(self, loc):
-        file_ = _CFile(loc, b'rb')
+        cdef CFile file_ = CFile(loc, b'rb')
         cdef int32_t word_len
         cdef int32_t vec_len
         cdef int32_t prev_vec_len = 0

@@ -198,22 +232,20 @@ cdef class Vocab:
         cdef bytes py_word
         cdef vector[float*] vectors
         cdef int i
+        cdef Pool tmp_mem = Pool()
         while True:
             try:
-                file_.read(&word_len, sizeof(word_len), 1)
+                file_.read_into(&word_len, sizeof(word_len), 1)
             except IOError:
                 break
-            file_.read(&vec_len, sizeof(vec_len), 1)
+            file_.read_into(&vec_len, sizeof(vec_len), 1)
             if prev_vec_len != 0 and vec_len != prev_vec_len:
                 raise VectorReadError.mismatched_sizes(loc, vec_len, prev_vec_len)
             if 0 >= vec_len >= MAX_VEC_SIZE:
                 raise VectorReadError.bad_size(loc, vec_len)
-            mem = Address(word_len, sizeof(char))
-            chars = <char*>mem.ptr
-            vec = <float*>self.mem.alloc(vec_len, sizeof(float))

-            file_.read(chars, sizeof(char), word_len)
-            file_.read(vec, sizeof(float), vec_len)
+            chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
+            vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))

             string_id = self.strings[chars[:word_len]]
             while string_id >= vectors.size():
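The on-disk record format being read here is `[int32 word_len][int32 vec_len][word_len bytes of UTF-8][vec_len float32s]`, repeated until EOF. A stand-alone Python reader for the same layout (a sketch; names are illustrative):

    import struct

    def iter_binary_vectors(loc):
        with open(loc, 'rb') as f:
            while True:
                header = f.read(8)
                if len(header) < 8:          # clean EOF on the length fields
                    break
                word_len, vec_len = struct.unpack('ii', header)
                word = f.read(word_len).decode('utf8')
                vec = struct.unpack('%df' % vec_len, f.read(vec_len * 4))
                yield word, vec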
@@ -235,7 +267,7 @@ cdef class Vocab:


 def write_binary_vectors(in_loc, out_loc):
-    cdef _CFile out_file = _CFile(out_loc, 'wb')
+    cdef CFile out_file = CFile(out_loc, 'wb')
     cdef Address mem
     cdef int32_t word_len
     cdef int32_t vec_len

@@ -252,42 +284,12 @@ def write_binary_vectors(in_loc, out_loc):
             word_len = len(word)
             vec_len = len(pieces)

-            out_file.write(sizeof(word_len), 1, &word_len)
-            out_file.write(sizeof(vec_len), 1, &vec_len)
+            out_file.write_from(&word_len, 1, sizeof(word_len))
+            out_file.write_from(&vec_len, 1, sizeof(vec_len))

             chars = <char*>word
-            out_file.write(sizeof(char), len(word), chars)
-            out_file.write(sizeof(float), vec_len, vec)
+            out_file.write_from(chars, len(word), sizeof(char))
+            out_file.write_from(vec, vec_len, sizeof(float))
-
-
-cdef class _CFile:
-    cdef FILE* fp
-    def __init__(self, loc, bytes mode):
-        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
-        self.fp = fopen(<char*>bytes_loc, mode)
-        if self.fp == NULL:
-            raise IOError
-
-    def __dealloc__(self):
-        fclose(self.fp)
-
-    def close(self):
-        fclose(self.fp)
-
-    cdef int read(self, void* dest, size_t elem_size, size_t n) except -1:
-        st = fread(dest, elem_size, n, self.fp)
-        if st != n:
-            raise IOError
-
-    cdef int write(self, size_t elem_size, size_t n, void* data) except -1:
-        st = fwrite(data, elem_size, n, self.fp)
-        if st != n:
-            raise IOError
-
-    cdef int write_unicode(self, unicode value):
-        cdef bytes py_bytes = value.encode('utf8')
-        cdef char* chars = <char*>py_bytes
-        self.write(sizeof(char), len(py_bytes), chars)


 class VectorReadError(Exception):
@@ -7,3 +7,19 @@ import os
 def EN():
     data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
     return English(data_dir=data_dir)
+
+
+def pytest_addoption(parser):
+    parser.addoption("--models", action="store_true",
+        help="include tests that require full models")
+    parser.addoption("--vectors", action="store_true",
+        help="include word vectors tests")
+    parser.addoption("--slow", action="store_true",
+        help="include slow tests")
+
+
+def pytest_runtest_setup(item):
+    for opt in ['models', 'vectors', 'slow']:
+        if opt in item.keywords and not item.config.getoption("--%s" % opt):
+            pytest.skip("need --%s option to run" % opt)
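Together these two hooks make the heavyweight tests opt-in: anything marked `models`, `vectors`, or `slow` is skipped unless the matching flag is passed, e.g. `py.test tests/ --models --vectors --slow` to run everything, while the plain `py.test tests/ -x` in .travis.yml stays model-free. Marking a test then looks like:

    import pytest

    @pytest.mark.models
    def test_needs_installed_model(EN):
        # Skipped under a bare `py.test`, collected with `py.test --models`.
        assert EN(u'A short sentence.')[0].orth_ == 'A'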
@@ -1,4 +1,6 @@
+import pytest
+
+@pytest.mark.models
 def test_simple_types(EN):
     tokens = EN(u'Mr. Best flew to New York on Saturday morning.')
     ents = list(tokens.ents)

@@ -1,6 +1,7 @@
 import pytest


+@pytest.mark.models
 def test_root(EN):
     tokens = EN(u"i don't have other assistance")
     for t in tokens:

@@ -12,6 +12,7 @@ def sun_text():
     return text


+@pytest.mark.models
 def test_consistency(EN, sun_text):
     tokens = EN(sun_text)
     for head in tokens:

@@ -21,6 +22,7 @@ def test_consistency(EN, sun_text):
             assert child.head is head


+@pytest.mark.models
 def test_child_consistency(EN, sun_text):
     tokens = EN(sun_text)


@@ -53,6 +55,7 @@ def test_child_consistency(EN, sun_text):
         assert not children


+@pytest.mark.models
 def test_edges(EN):
     sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."
     tokens = EN(sun_text)

@@ -1,6 +1,8 @@
 from __future__ import unicode_literals
+import pytest


+@pytest.mark.models
 def test_subtrees(EN):
     sent = EN('The four wheels on the bus turned quickly')
     wheels = sent[2]
@@ -45,7 +45,7 @@ def test1():
     codec = HuffmanCodec(list(enumerate(probs)))

     py_codes = py_encode(dict(enumerate(probs)))
-    py_codes = py_codes.items()
+    py_codes = list(py_codes.items())
     py_codes.sort()
     assert codec.strings == [c for i, c in py_codes]

@@ -60,7 +60,7 @@ def test_round_trip():
     strings = list(codec.strings)
     codes = {codec.leaves[i]: strings[i] for i in range(len(codec.leaves))}
     bits = codec.encode(message)
-    string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in bits.as_bytes())
+    string = ''.join('{0:b}'.format(c).rjust(8, '0')[::-1] for c in bits.as_bytes())
     for word in message:
         code = codes[word]
         assert string[:len(code)] == code

@@ -76,7 +76,7 @@ def test_rosetta():
     symb2freq = defaultdict(int)
     for ch in txt:
         symb2freq[ch] += 1
-    by_freq = symb2freq.items()
+    by_freq = list(symb2freq.items())
     by_freq.sort(reverse=True, key=lambda item: item[1])
     symbols = [sym for sym, prob in by_freq]
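These three edits are straight Python 3 ports: `dict.items()` now returns a view, which has no `.sort()`, so it is materialised with `list()` first; and iterating over `bytes` yields integers rather than one-character strings, which removes both the `ord()` call and the bytes-literal formatting. The bit-string trick in isolation (`bytearray` iterates to ints on Python 2 as well, so this form is portable):

    data = b'ab'
    bits = ''.join('{0:b}'.format(c).rjust(8, '0')[::-1] for c in bytearray(data))
    # Each byte is written LSB-first: ord('a') == 0b01100001 -> '10000110'.
    assert bits == '1000011001000110'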
@@ -96,6 +96,7 @@ def test_rosetta():
     assert my_exp_len == py_exp_len


+@pytest.mark.slow
 def test_vocab(EN):
     codec = HuffmanCodec([(w.orth, numpy.exp(w.prob)) for w in EN.vocab])
     expected_length = 0

@@ -105,6 +106,7 @@ def test_vocab(EN):
     assert 8 < expected_length < 15


+@pytest.mark.slow
 def test_freqs():
     freqs = []
     words = []

 tests/serialize/test_io.py | 23 (new file)
@@ -0,0 +1,23 @@
+import pytest
+
+from spacy.serialize.packer import Packer
+from spacy.attrs import ORTH, SPACY
+from spacy.tokens import Doc
+import math
+
+
+def test_read_write(EN):
+    doc1 = EN(u'This is a simple test. With a couple of sentences.')
+    doc2 = EN(u'This is another test document.')
+
+    with open('/tmp/spacy_docs.bin', 'wb') as file_:
+        file_.write(doc1.to_bytes())
+        file_.write(doc2.to_bytes())
+
+    with open('/tmp/spacy_docs.bin', 'rb') as file_:
+        bytes1, bytes2 = Doc.read_bytes(file_)
+        r1 = Doc(EN.vocab).from_bytes(bytes1)
+        r2 = Doc(EN.vocab).from_bytes(bytes2)
+
+    assert r1.string == doc1.string
+    assert r2.string == doc2.string
@@ -56,12 +56,12 @@ def test_char_packer(vocab):
     bits = BitArray()
     bits.seek(0)

-    byte_str = b'the dog jumped'
+    byte_str = bytearray(b'the dog jumped')
     packer.char_codec.encode(byte_str, bits)
     bits.seek(0)
     result = [b''] * len(byte_str)
     packer.char_codec.decode(bits, result)
-    assert b''.join(result) == byte_str
+    assert bytearray(result) == byte_str


 def test_packer_unannotated(tokenizer):

@@ -120,5 +120,3 @@ def test_packer_annotated(tokenizer):
     assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']
     assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']
     assert [(t.head.i - t.i) for t in result] == [1, 1, 0]
-
-
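The switch from `bytes` to `bytearray` here is another 2/3 compatibility move: indexing `bytes` returns a one-character string on Python 2 but an `int` on Python 3, while `bytearray` indexes to small integers on both, so the char codec sees identical input either way:

    byte_str = b'the dog jumped'
    # bytes indexing is version-dependent: byte_str[0] is 't' on Python 2, 116 on Python 3.
    # bytearray indexing is not:
    assert bytearray(byte_str)[0] == 116  # ord('t'), on either interpreter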
@@ -1,6 +1,8 @@
 from __future__ import unicode_literals
+import pytest


+@pytest.mark.models
 def test_merge_tokens(EN):
     tokens = EN(u'Los Angeles start.')
     assert len(tokens) == 4

@@ -12,6 +14,7 @@ def test_merge_tokens(EN):
     assert tokens[0].head.orth_ == 'start'


+@pytest.mark.models
 def test_merge_heads(EN):
     tokens = EN(u'I found a pilates class near work.')
     assert len(tokens) == 8

@@ -9,6 +9,7 @@ def doc(EN):
     return EN('This is a sentence. This is another sentence. And a third.')


+@pytest.mark.models
 def test_sent_spans(doc):
     sents = list(doc.sents)
     assert sents[0].start == 0

@@ -17,6 +18,7 @@ def test_sent_spans(doc):
     assert sum(len(sent) for sent in sents) == len(doc)


+@pytest.mark.models
 def test_root(doc):
     np = doc[2:4]
     assert len(np) == 2

@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import pytest


+@pytest.mark.models
 def test_am_pm(en_nlp):
     numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
     variants = ['a.m.', 'am', 'p.m.', 'pm']

@@ -14,7 +15,7 @@ def test_am_pm(en_nlp):
                 tokens = en_nlp(string, merge_mwes=True)
                 assert tokens[4].orth_ == '%s%s%s' % (num, space, var)
                 ents = list(tokens.ents)
-                assert len(ents) == 1
+                assert len(ents) == 1, ents
                 assert ents[0].label_ == 'TIME', string
                 if ents[0].start == 4 and ents[0].end == 5:
                     assert ents[0].orth_ == '%s%s%s' % (num, space, var)

@@ -17,6 +17,7 @@ def lemmas(tagged):
     return [t.lemma_ for t in tagged]


+@pytest.mark.models
 def test_lemmas(lemmas, tagged):
     assert lemmas[0] == 'banana'
     assert lemmas[1] == 'in'

@@ -12,6 +12,7 @@ def morph_exc():
            }


+@pytest.mark.models
 def test_load_exc(morph_exc):
     # Do this local as we want to modify it
     nlp =  English()

@@ -1,7 +1,9 @@
 from spacy.en import English
 import six
+import pytest


+@pytest.mark.models
 def test_tag_names(EN):
     tokens = EN(u'I ate pizzas with anchovies.', parse=False, tag=True)
     pizza = tokens[2]

@@ -1,8 +1,9 @@
 # -*- coding: utf-8 -*-
 """Sphinx doctest is just too hard. Manually paste doctest examples here"""
 from spacy.en.attrs import IS_LOWER
+import pytest
+
+@pytest.mark.models
 def test_1():
     import spacy.en
     from spacy.parts_of_speech import ADV

@@ -21,6 +22,7 @@ def test_1():
     assert o == -11.07155704498291


+@pytest.mark.models
 def test2():
     import spacy.en
     from spacy.parts_of_speech import ADV

@@ -41,6 +43,7 @@ def test2():
     -11.07155704498291


+@pytest.mark.models
 def test3():
     import spacy.en
     from spacy.parts_of_speech import ADV
@@ -15,6 +15,7 @@ def test_attr_of_token(EN):
     assert feats_array[0][0] != feats_array[0][1]


+@pytest.mark.models
 def test_tag(EN):
     text = u'A nice sentence.'
     tokens = EN(text)

@@ -26,6 +27,7 @@ def test_tag(EN):
     assert feats_array[3][1] == tokens[3].tag


+@pytest.mark.models
 def test_dep(EN):
     text = u'A nice sentence.'
     tokens = EN(text)

@@ -4,6 +4,7 @@ import pytest
 from spacy.parts_of_speech import ADV


+@pytest.mark.models
 def test_prob(EN):
     tokens = EN(u'Give it back', parse=False)
     give = tokens[0]

@@ -7,6 +7,7 @@ from spacy.en.attrs import IS_STOP
 import pytest


+@pytest.mark.models
 def test_strings(EN):
     tokens = EN(u'Give it back! He pleaded.')
     token = tokens[0]

@@ -9,6 +9,7 @@ data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
 # Let this have its own instances, as we have to be careful about memory here
 # that's the point, after all

+@pytest.mark.models
 def get_orphan_token(text, i):
     nlp = English(load_vectors=False, data_dir=data_dir)
     tokens = nlp(text)

@@ -18,6 +19,7 @@ def get_orphan_token(text, i):
     return token


+@pytest.mark.models
 def test_orphan():
     orphan = get_orphan_token('An orphan token', 1)
     gc.collect()

@@ -36,6 +38,7 @@ def _orphan_from_list(toks):
     return lst


+@pytest.mark.models
 def test_list_orphans():
     # Test case from NSchrading
     nlp = English(load_vectors=False, data_dir=data_dir)
@@ -5,7 +5,7 @@ from spacy.tokens import Doc
 import pytest


-def test_getitem(EN):
+def mest_getitem(EN):
     tokens = EN(u'Give it back! He pleaded.')
     assert tokens[0].orth_ == 'Give'
     assert tokens[-1].orth_ == '.'

@@ -13,10 +13,19 @@ def test_getitem(EN):
         tokens[len(tokens)]


-def test_serialize(EN):
-    tokens = EN(u' Give it back! He pleaded. ')
-    packed = tokens.serialize()
-    new_tokens = Doc.deserialize(EN.vocab, packed)
+def mest_serialize(EN):
+    tokens = EN(u'Give it back! He pleaded.')
+    packed = tokens.to_bytes()
+    new_tokens = Doc(EN.vocab).from_bytes(packed)
+    assert tokens.string == new_tokens.string
+    assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
+    assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
+
+
+def test_serialize_whitespace(EN):
+    tokens = EN(u' Give it back! He pleaded. ')
+    packed = tokens.to_bytes()
+    new_tokens = Doc(EN.vocab).from_bytes(packed)
     assert tokens.string == new_tokens.string
     assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
@@ -4,13 +4,14 @@ from spacy.en import English

 import pytest

+@pytest.mark.vectors
 def test_vec(EN):
     hype = EN.vocab['hype']
     assert hype.orth_ == 'hype'
     assert 0.08 >= hype.repvec[0] > 0.07


+@pytest.mark.vectors
 def test_capitalized(EN):
     hype = EN.vocab['Hype']
     assert hype.orth_ == 'Hype'
@@ -39,7 +39,7 @@ def test_retrieve_id(sstore):

 def test_med_string(sstore):
     nine_char_string = sstore[b'0123456789']
-    assert sstore[nine_char_string] == b'0123456789'
+    assert sstore[nine_char_string] == u'0123456789'
     dummy = sstore[b'A']
     assert sstore[b'0123456789'] == nine_char_string