mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge branch 'master' of https://github.com/honnibal/spaCy
This commit is contained in:
commit
2e6a60eaec
17
.travis.yml
17
.travis.yml
|
@ -11,11 +11,18 @@ python:
|
|||
# install dependencies
|
||||
install:
|
||||
- "pip install --upgrade setuptools"
|
||||
- "rm -rf spacy/"
|
||||
- "pip install spacy"
|
||||
- "pip install cython fabric fabtools"
|
||||
- "pip install -r requirements.txt"
|
||||
- "python setup.py build_ext --inplace"
|
||||
- "mkdir -p corpora/en"
|
||||
- "cd corpora/en"
|
||||
- "wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz"
|
||||
- "tar -xzf WordNet-3.0.tar.gz"
|
||||
- "mv WordNet-3.0 wordnet"
|
||||
- "cd ../../"
|
||||
- "export PYTHONPATH=`pwd`"
|
||||
- "python bin/init_model.py lang_data/en corpora/en spacy/en/data"
|
||||
|
||||
# run tests
|
||||
script:
|
||||
- py.test tests/tokenizer/
|
||||
- py.test tests/vocab/
|
||||
- py.test tests/tagger/
|
||||
- "py.test tests/ -x"
|
||||
|
|
27
bin/gather_freqs.py
Normal file
27
bin/gather_freqs.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
import plac
|
||||
|
||||
def main(in_loc, out_loc):
    """Merge runs of adjacent identical keys in a frequency file.

    Every non-blank line of ``in_loc`` must have the form
    ``<freq> TAB <key>``. Only *adjacent* lines are compared, so equal keys
    must already be grouped together (e.g. the input is sorted by key). For
    each run of equal keys, one line is written to ``out_loc``::

        <summed freq> TAB <number of lines in the run> TAB <key>

    Leading and trailing whitespace is stripped from each line before it is
    parsed. If the input has no data lines, the output file is left empty.

    Args:
        in_loc: Path of the tab-separated input file.
        out_loc: Path of the output file; it is created or overwritten.

    Raises:
        ValueError: If a non-blank line has no tab or its frequency is not
            an integer.
    """
    this_key = None
    this_freq = 0
    df = 0
    with open(in_loc) as in_file, open(out_loc, 'w') as out_file:
        for line in in_file:
            line = line.strip()
            if not line:
                continue
            freq, key = line.split('\t', 1)
            freq = int(freq)
            if key != this_key:
                # A new run starts. Flush the previous one, unless this is
                # the very first data line and there is nothing to flush.
                if this_key is not None:
                    out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
                this_key = key
                this_freq = freq
                df = 1
            else:
                this_freq += freq
                df += 1
        # Flush the final run; skipped when the input had no data lines.
        if this_key is not None:
            out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))


if __name__ == '__main__':
    plac.call(main)
|
|
@ -15,6 +15,8 @@ Requires:
|
|||
* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
|
||||
* vectors.tgz --- output of something like word2vec
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -45,7 +47,7 @@ def setup_tokenizer(lang_data_dir, tok_dir):
|
|||
|
||||
def _read_clusters(loc):
|
||||
if not loc.exists():
|
||||
print "Warning: Clusters file not found"
|
||||
print("Warning: Clusters file not found")
|
||||
return {}
|
||||
clusters = {}
|
||||
for line in codecs.open(str(loc), 'r', 'utf8'):
|
||||
|
@ -60,7 +62,7 @@ def _read_clusters(loc):
|
|||
else:
|
||||
clusters[word] = '0'
|
||||
# Expand clusters with re-casing
|
||||
for word, cluster in clusters.items():
|
||||
for word, cluster in list(clusters.items()):
|
||||
if word.lower() not in clusters:
|
||||
clusters[word.lower()] = cluster
|
||||
if word.title() not in clusters:
|
||||
|
@ -72,7 +74,7 @@ def _read_clusters(loc):
|
|||
|
||||
def _read_probs(loc):
|
||||
if not loc.exists():
|
||||
print "Warning: Probabilities file not found"
|
||||
print("Warning: Probabilities file not found")
|
||||
return {}
|
||||
probs = {}
|
||||
for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
|
||||
|
@ -85,7 +87,7 @@ def _read_probs(loc):
|
|||
def _read_senses(loc):
|
||||
lexicon = defaultdict(lambda: defaultdict(list))
|
||||
if not loc.exists():
|
||||
print "Warning: WordNet senses not found"
|
||||
print("Warning: WordNet senses not found")
|
||||
return lexicon
|
||||
sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
|
||||
pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
|
||||
|
@ -109,13 +111,20 @@ def setup_vocab(src_dir, dst_dir):
|
|||
if vectors_src.exists():
|
||||
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
|
||||
else:
|
||||
print "Warning: Word vectors file not found"
|
||||
print("Warning: Word vectors file not found")
|
||||
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
|
||||
clusters = _read_clusters(src_dir / 'clusters.txt')
|
||||
probs = _read_probs(src_dir / 'words.sgt.prob')
|
||||
lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
|
||||
if not probs:
|
||||
min_prob = 0.0
|
||||
else:
|
||||
min_prob = min(probs.values())
|
||||
for word in clusters:
|
||||
if word not in probs:
|
||||
probs[word] = min_prob
|
||||
|
||||
lexicon = []
|
||||
for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
|
||||
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
|
||||
entry = get_lex_props(word)
|
||||
if word in clusters or float(prob) >= -17:
|
||||
entry['prob'] = float(prob)
|
||||
|
@ -144,7 +153,7 @@ def main(lang_data_dir, corpora_dir, model_dir):
|
|||
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
|
||||
setup_vocab(corpora_dir, model_dir / 'vocab')
|
||||
if not (model_dir / 'wordnet').exists():
|
||||
copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))
|
||||
copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import division
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
from os import path
|
||||
|
@ -107,7 +108,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
|||
|
||||
nlp = Language(data_dir=model_dir)
|
||||
|
||||
print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %"
|
||||
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
|
||||
for itn in range(n_iter):
|
||||
scorer = Scorer()
|
||||
loss = 0
|
||||
|
@ -138,9 +139,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
|||
nlp.entity.train(tokens, gold)
|
||||
nlp.tagger.train(tokens, gold.tags)
|
||||
random.shuffle(gold_tuples)
|
||||
print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
|
||||
scorer.tags_acc,
|
||||
scorer.token_acc)
|
||||
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
|
||||
scorer.tags_acc,
|
||||
scorer.token_acc))
|
||||
nlp.end_training()
|
||||
|
||||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
|
||||
|
@ -219,14 +220,14 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos
|
|||
# write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
|
||||
scorer = evaluate(English, list(read_json_file(dev_loc)),
|
||||
model_dir, gold_preproc=gold_preproc, verbose=verbose)
|
||||
print 'TOK', scorer.token_acc
|
||||
print 'POS', scorer.tags_acc
|
||||
print 'UAS', scorer.uas
|
||||
print 'LAS', scorer.las
|
||||
print('TOK', scorer.token_acc)
|
||||
print('POS', scorer.tags_acc)
|
||||
print('UAS', scorer.uas)
|
||||
print('LAS', scorer.las)
|
||||
|
||||
print 'NER P', scorer.ents_p
|
||||
print 'NER R', scorer.ents_r
|
||||
print 'NER F', scorer.ents_f
|
||||
print('NER P', scorer.ents_p)
|
||||
print('NER R', scorer.ents_r)
|
||||
print('NER F', scorer.ents_f)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
316709
corpora/en/clusters.txt
Normal file
316709
corpora/en/clusters.txt
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -2,7 +2,7 @@ cython
|
|||
cymem == 1.11
|
||||
pathlib
|
||||
preshed == 0.37
|
||||
thinc == 3.2
|
||||
thinc == 3.3
|
||||
murmurhash == 0.24
|
||||
unidecode
|
||||
numpy
|
||||
|
|
3
setup.py
3
setup.py
|
@ -120,7 +120,7 @@ def run_setup(exts):
|
|||
ext_modules=exts,
|
||||
license="Dual: Commercial or AGPL",
|
||||
install_requires=['numpy', 'murmurhash', 'cymem >= 1.11', 'preshed == 0.37',
|
||||
'thinc == 3.2', "unidecode", 'wget', 'plac', 'six',
|
||||
'thinc == 3.3', "unidecode", 'wget', 'plac', 'six',
|
||||
'ujson'],
|
||||
setup_requires=["headers_workaround"],
|
||||
)
|
||||
|
@ -162,6 +162,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
|
|||
'spacy.gold', 'spacy.orth',
|
||||
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
|
||||
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
|
||||
'spacy.cfile',
|
||||
'spacy.syntax.ner']
|
||||
|
||||
|
||||
|
|
12
spacy/cfile.pxd
Normal file
12
spacy/cfile.pxd
Normal file
|
@ -0,0 +1,12 @@
|
|||
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
# Declaration of CFile, a wrapper around a C FILE* stream. The methods are
# implemented in spacy/cfile.pyx.
cdef class CFile:
|
||||
# Stream handle returned by fopen() in __init__.
cdef FILE* fp
|
||||
# Set to True once fopen() succeeds and back to False by close();
# __dealloc__ only calls fclose() while it is True.
cdef bint is_open
|
||||
|
||||
# fread() `number` elements of `elem_size` bytes each into `dest`.
# Raises IOError if fewer than `number` elements were read.
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
|
||||
|
||||
# fwrite() `number` elements of `elem_size` bytes each from `src`.
# Raises IOError if fewer than `number` elements were written.
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
|
||||
|
||||
# Allocate number x elem_size from the Pool `mem`, fill the buffer with
# read_into(), and return it.
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
|
40
spacy/cfile.pyx
Normal file
40
spacy/cfile.pyx
Normal file
|
@ -0,0 +1,40 @@
|
|||
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
||||
|
||||
|
||||
cdef class CFile:
    """Minimal wrapper around a C ``FILE*`` for binary reads and writes.

    The stream is opened with ``fopen`` in ``__init__`` and closed either by
    an explicit ``close()`` call or when the object is deallocated.
    """
    def __init__(self, loc, mode):
        """Open ``loc`` using the ``fopen`` mode string ``mode``.

        Both arguments may be text or byte strings; text is encoded first
        because ``fopen`` takes C strings.

        Raises:
            IOError: If ``fopen`` fails.
        """
        # Bind mode_str on every path. Previously it was assigned only for
        # unicode modes, so a bytes mode such as b'rb' left it undefined.
        cdef bytes mode_str = mode.encode('ascii') if isinstance(mode, unicode) else mode
        cdef bytes bytes_loc = loc.encode('utf8') if isinstance(loc, unicode) else loc
        self.fp = fopen(<char*>bytes_loc, <char*>mode_str)
        if self.fp == NULL:
            raise IOError("Could not open binary file %s" % bytes_loc)
        self.is_open = True

    def __dealloc__(self):
        # Release the stream if close() was never called.
        if self.is_open:
            fclose(self.fp)

    def close(self):
        """Close the underlying stream. Calling it again is a no-op."""
        # Guard against a second fclose() on the same FILE*.
        if self.is_open:
            fclose(self.fp)
            self.is_open = False

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        """Read ``number`` elements of ``elem_size`` bytes each into ``dest``.

        Raises IOError if fewer than ``number`` elements were read.
        """
        cdef size_t st = fread(dest, elem_size, number, self.fp)
        if st != number:
            raise IOError
        return 0

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
        """Write ``number`` elements of ``elem_size`` bytes each from ``src``.

        Raises IOError if fewer than ``number`` elements were written.
        """
        cdef size_t st = fwrite(src, elem_size, number, self.fp)
        if st != number:
            raise IOError
        return 0

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        """Allocate ``number`` x ``elem_size`` from ``mem`` and fill it from the file.

        Returns the newly allocated buffer. Propagates the IOError raised by
        ``read_into`` on a short read.
        """
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        """Write ``value`` to the file as raw UTF-8 bytes."""
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        # CFile has no write() method; write_from takes (src, number, elem_size).
        self.write_from(chars, len(py_bytes), sizeof(char))
|
|
@ -95,15 +95,15 @@ class English(object):
|
|||
|
||||
self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))
|
||||
|
||||
if Tagger:
|
||||
if Tagger and path.exists(path.join(data_dir, 'pos')):
|
||||
self.tagger = Tagger(self.vocab.strings, data_dir)
|
||||
else:
|
||||
self.tagger = None
|
||||
if Parser:
|
||||
if Parser and path.exists(path.join(data_dir, 'deps')):
|
||||
self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps'))
|
||||
else:
|
||||
self.parser = None
|
||||
if Entity:
|
||||
if Entity and path.exists(path.join(data_dir, 'ner')):
|
||||
self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
|
||||
else:
|
||||
self.entity = None
|
||||
|
@ -153,15 +153,14 @@ class English(object):
|
|||
self.tagger.model.end_training()
|
||||
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
|
||||
|
||||
packer = Packer(self.vocab, [
|
||||
(TAG, self.tagger.moves.freqs[TAG].items()),
|
||||
(HEAD, self.parser.moves.freqs[HEAD].items()),
|
||||
(DEP, self.parser.moves.freqs[DEP].items()),
|
||||
(ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()),
|
||||
(ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items())
|
||||
])
|
||||
|
||||
packer.dump(path.join(data_dir, 'vocab'))
|
||||
with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
|
||||
file_.write(
|
||||
json.dumps([
|
||||
(TAG, self.tagger.freqs[TAG].items()),
|
||||
(DEP, self.parser.moves.freqs[DEP].items()),
|
||||
(ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()),
|
||||
(ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items()),
|
||||
(HEAD, self.parser.moves.freqs[HEAD].items())]))
|
||||
|
||||
@property
|
||||
def tags(self):
|
||||
|
|
|
@ -14,6 +14,9 @@ from ..attrs cimport LEMMA as _LEMMA
|
|||
from ..attrs cimport POS as _POS
|
||||
from ..attrs cimport TAG as _TAG
|
||||
from ..attrs cimport DEP as _DEP
|
||||
from ..attrs cimport HEAD as _HEAD
|
||||
from ..attrs cimport ENT_IOB as _ENT_IOB
|
||||
from ..attrs cimport ENT_TYPE as _ENT_TYPE
|
||||
|
||||
|
||||
cpdef enum:
|
||||
|
|
|
@ -262,6 +262,9 @@ cdef class EnPosTagger:
|
|||
'morphs.json'))))
|
||||
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
|
||||
self.freqs = {TAG: defaultdict(int)}
|
||||
for tag in self.tag_names:
|
||||
self.freqs[TAG][self.strings[tag]] = 1
|
||||
self.freqs[TAG][0] = 1
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||
|
|
|
@ -3,8 +3,6 @@ from cymem.cymem cimport Pool
|
|||
from .structs cimport TokenC
|
||||
from .syntax.transition_system cimport Transition
|
||||
|
||||
cimport numpy
|
||||
|
||||
|
||||
cdef struct GoldParseC:
|
||||
int* tags
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
import numpy
|
||||
import codecs
|
||||
import json
|
||||
import ujson
|
||||
import random
|
||||
import re
|
||||
import os
|
||||
|
@ -9,6 +7,11 @@ from os import path
|
|||
|
||||
from libc.string cimport memset
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
except ImportError:
|
||||
import json
|
||||
|
||||
|
||||
def tags_to_entities(tags):
|
||||
entities = []
|
||||
|
@ -128,7 +131,7 @@ def read_json_file(loc, docs_filter=None):
|
|||
yield from read_json_file(path.join(loc, filename))
|
||||
else:
|
||||
with open(loc) as file_:
|
||||
docs = ujson.load(file_)
|
||||
docs = json.load(file_)
|
||||
for doc in docs:
|
||||
if docs_filter is not None and not docs_filter(doc):
|
||||
continue
|
||||
|
|
|
@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil
|
|||
|
||||
|
||||
cdef class BitArray:
|
||||
cdef bytes data
|
||||
cdef bytearray data
|
||||
cdef uchar byte
|
||||
cdef uchar bit_of_byte
|
||||
cdef uint32_t i
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from libc.string cimport memcpy
|
||||
|
||||
# Note that we're setting the most significant bits here first, when in practice
|
||||
|
@ -15,7 +17,7 @@ cdef Code bit_append(Code code, bint bit) nogil:
|
|||
|
||||
cdef class BitArray:
|
||||
def __init__(self, data=b''):
|
||||
self.data = data
|
||||
self.data = bytearray(data)
|
||||
self.byte = 0
|
||||
self.bit_of_byte = 0
|
||||
self.i = 0
|
||||
|
@ -45,7 +47,7 @@ cdef class BitArray:
|
|||
start_bit = self.i % 8
|
||||
|
||||
if start_bit != 0 and start_byte < len(self.data):
|
||||
byte = ord(self.data[start_byte])
|
||||
byte = self.data[start_byte]
|
||||
for i in range(start_bit, 8):
|
||||
self.i += 1
|
||||
yield 1 if (byte & (one << i)) else 0
|
||||
|
@ -68,18 +70,24 @@ cdef class BitArray:
|
|||
|
||||
# TODO portability
|
||||
cdef uchar[4] chars
|
||||
chars[0] = <uchar>ord(self.data[start_byte])
|
||||
chars[1] = <uchar>ord(self.data[start_byte+1])
|
||||
chars[2] = <uchar>ord(self.data[start_byte+2])
|
||||
chars[3] = <uchar>ord(self.data[start_byte+3])
|
||||
chars[0] = self.data[start_byte]
|
||||
chars[1] = self.data[start_byte+1]
|
||||
chars[2] = self.data[start_byte+2]
|
||||
chars[3] = self.data[start_byte+3]
|
||||
cdef uint32_t output
|
||||
memcpy(&output, chars, 4)
|
||||
self.i += 32
|
||||
return output
|
||||
|
||||
def as_bytes(self):
|
||||
cdef unsigned char byte_char
|
||||
if self.bit_of_byte != 0:
|
||||
return self.data + chr(self.byte)
|
||||
byte = chr(self.byte)
|
||||
# Jump through some hoops for Python3
|
||||
if isinstance(byte, unicode):
|
||||
return self.data + <bytes>(&self.byte)[:1]
|
||||
else:
|
||||
return self.data + chr(self.byte)
|
||||
else:
|
||||
return self.data
|
||||
|
||||
|
@ -92,7 +100,7 @@ cdef class BitArray:
|
|||
self.bit_of_byte += 1
|
||||
self.i += 1
|
||||
if self.bit_of_byte == 8:
|
||||
self.data += chr(self.byte)
|
||||
self.data += bytearray((self.byte,))
|
||||
self.byte = 0
|
||||
self.bit_of_byte = 0
|
||||
|
||||
|
@ -106,7 +114,7 @@ cdef class BitArray:
|
|||
self.byte &= ~(one << self.bit_of_byte)
|
||||
self.bit_of_byte += 1
|
||||
if self.bit_of_byte == 8:
|
||||
self.data += chr(self.byte)
|
||||
self.data += <bytes>self.byte
|
||||
self.byte = 0
|
||||
self.bit_of_byte = 0
|
||||
self.i += 1
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
# cython: profile=True
|
||||
from __future__ import unicode_literals
|
||||
cimport cython
|
||||
from libcpp.queue cimport priority_queue
|
||||
from libcpp.pair cimport pair
|
||||
|
@ -110,14 +111,14 @@ cdef class HuffmanCodec:
|
|||
cdef int branch
|
||||
|
||||
cdef int n_msg = msg.shape[0]
|
||||
cdef bytes bytes_ = bits.as_bytes()
|
||||
cdef bytearray bytes_ = bits.as_bytes()
|
||||
cdef unsigned char byte
|
||||
cdef int i_msg = 0
|
||||
cdef int i_byte = bits.i // 8
|
||||
cdef unsigned char i_bit = 0
|
||||
cdef unsigned char one = 1
|
||||
while i_msg < n_msg:
|
||||
byte = ord(bytes_[i_byte])
|
||||
byte = bytes_[i_byte]
|
||||
i_byte += 1
|
||||
for i_bit in range(8):
|
||||
branch = node.right if (byte & (one << i_bit)) else node.left
|
||||
|
@ -138,11 +139,11 @@ cdef class HuffmanCodec:
|
|||
def __get__(self):
|
||||
output = []
|
||||
cdef int i, j
|
||||
cdef bytes string
|
||||
cdef unicode string
|
||||
cdef Code code
|
||||
for i in range(self.codes.size()):
|
||||
code = self.codes[i]
|
||||
string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
|
||||
string = '{0:b}'.format(code.bits).rjust(code.length, '0')
|
||||
string = string[::-1]
|
||||
output.append(string)
|
||||
return output
|
||||
|
|
|
@ -10,6 +10,7 @@ from libcpp.pair cimport pair
|
|||
from cymem.cymem cimport Address, Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
from preshed.counter cimport PreshCounter
|
||||
import json
|
||||
|
||||
from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||
from ..tokens.doc cimport Doc
|
||||
|
@ -65,7 +66,7 @@ def _gen_orths(Vocab vocab):
|
|||
def _gen_chars(Vocab vocab):
|
||||
cdef attr_t orth
|
||||
cdef size_t addr
|
||||
char_weights = {chr(i): 1e-20 for i in range(256)}
|
||||
char_weights = {i: 1e-20 for i in range(256)}
|
||||
cdef unicode string
|
||||
cdef bytes char
|
||||
cdef bytes utf8_str
|
||||
|
@ -74,9 +75,9 @@ def _gen_chars(Vocab vocab):
|
|||
string = vocab.strings[lex.orth]
|
||||
utf8_str = string.encode('utf8')
|
||||
for char in utf8_str:
|
||||
char_weights.setdefault(char, 0.0)
|
||||
char_weights[char] += c_exp(lex.prob)
|
||||
char_weights[b' '] += c_exp(lex.prob)
|
||||
char_weights.setdefault(ord(char), 0.0)
|
||||
char_weights[ord(char)] += c_exp(lex.prob)
|
||||
char_weights[ord(' ')] += c_exp(lex.prob)
|
||||
return char_weights.items()
|
||||
|
||||
|
||||
|
@ -98,33 +99,34 @@ cdef class Packer:
|
|||
self._codecs = tuple(codecs)
|
||||
self.attrs = tuple(attrs)
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, Vocab vocab, data_dir):
|
||||
return cls(vocab, util.read_encoding_freqs(data_dir))
|
||||
|
||||
def pack(self, Doc doc):
|
||||
bits = self._orth_encode(doc)
|
||||
if bits is None:
|
||||
bits = self._char_encode(doc)
|
||||
|
||||
cdef int i
|
||||
if self.attrs:
|
||||
array = doc.to_array(self.attrs)
|
||||
for i, codec in enumerate(self._codecs):
|
||||
codec.encode_int32(array[:, i], bits)
|
||||
return bits
|
||||
codec.encode(array[:, i], bits)
|
||||
return bits.as_bytes()
|
||||
|
||||
def unpack(self, BitArray bits):
|
||||
def unpack(self, data):
|
||||
doc = Doc(self.vocab)
|
||||
self.unpack_into(data, doc)
|
||||
return doc
|
||||
|
||||
def unpack_into(self, byte_string, Doc doc):
|
||||
bits = BitArray(byte_string)
|
||||
bits.seek(0)
|
||||
cdef int32_t length = bits.read32()
|
||||
if length >= 0:
|
||||
doc = self._orth_decode(bits, length)
|
||||
self._orth_decode(bits, length, doc)
|
||||
else:
|
||||
doc = self._char_decode(bits, -length)
|
||||
|
||||
self._char_decode(bits, -length, doc)
|
||||
|
||||
array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32)
|
||||
for i, codec in enumerate(self._codecs):
|
||||
codec.decode_int32(bits, array[:, i])
|
||||
codec.decode(bits, array[:, i])
|
||||
|
||||
doc.from_array(self.attrs, array)
|
||||
return doc
|
||||
|
@ -141,20 +143,13 @@ cdef class Packer:
|
|||
bits.append(bool(token.whitespace_))
|
||||
return bits
|
||||
|
||||
def _orth_decode(self, BitArray bits, n):
|
||||
orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
|
||||
self.orth_codec.decode_int32(bits, orths)
|
||||
orths_and_spaces = zip(orths, bits)
|
||||
cdef Doc doc = Doc(self.vocab, orths_and_spaces)
|
||||
return doc
|
||||
|
||||
def _char_encode(self, Doc doc):
|
||||
cdef bytes utf8_str = doc.string.encode('utf8')
|
||||
cdef BitArray bits = BitArray()
|
||||
cdef int32_t length = len(utf8_str)
|
||||
# Signal chars with negative length
|
||||
bits.extend(-length, 32)
|
||||
self.char_codec.encode(utf8_str, bits)
|
||||
self.char_codec.encode(bytearray(utf8_str), bits)
|
||||
cdef int i, j
|
||||
for i in range(doc.length):
|
||||
for j in range(doc.data[i].lex.length-1):
|
||||
|
@ -164,12 +159,24 @@ cdef class Packer:
|
|||
bits.append(False)
|
||||
return bits
|
||||
|
||||
def _char_decode(self, BitArray bits, n):
|
||||
def _orth_decode(self, BitArray bits, int32_t n, Doc doc):
|
||||
cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
|
||||
self.orth_codec.decode_int32(bits, orths)
|
||||
cdef int i
|
||||
cdef bint space
|
||||
spaces = iter(bits)
|
||||
for i in range(n):
|
||||
orth = orths[i]
|
||||
space = next(spaces)
|
||||
lex = self.vocab.get_by_orth(doc.mem, orth)
|
||||
doc.push_back(lex, space)
|
||||
return doc
|
||||
|
||||
def _char_decode(self, BitArray bits, int32_t n, Doc doc):
|
||||
cdef bytearray utf8_str = bytearray(n)
|
||||
self.char_codec.decode(bits, utf8_str)
|
||||
|
||||
cdef unicode string = utf8_str.decode('utf8')
|
||||
cdef Doc tokens = Doc(self.vocab)
|
||||
cdef int start = 0
|
||||
cdef bint is_spacy
|
||||
cdef int length = len(string)
|
||||
|
@ -178,11 +185,11 @@ cdef class Packer:
|
|||
for is_end_token in bits:
|
||||
if is_end_token:
|
||||
span = string[start:i+1]
|
||||
lex = self.vocab.get(tokens.mem, span)
|
||||
lex = self.vocab.get(doc.mem, span)
|
||||
is_spacy = (i+1) < length and string[i+1] == u' '
|
||||
tokens.push_back(lex, is_spacy)
|
||||
doc.push_back(lex, is_spacy)
|
||||
start = i + 1 + is_spacy
|
||||
i += 1
|
||||
if i >= n:
|
||||
break
|
||||
return tokens
|
||||
return doc
|
||||
|
|
|
@ -81,6 +81,7 @@ cdef class StringStore:
|
|||
def __getitem__(self, object string_or_id):
|
||||
cdef bytes byte_string
|
||||
cdef const Utf8Str* utf8str
|
||||
cdef int id_
|
||||
if isinstance(string_or_id, int) or isinstance(string_or_id, long):
|
||||
if string_or_id == 0:
|
||||
return u''
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
# cython: profile=True
|
||||
"""
|
||||
Fill an array, context, with every _atomic_ value our features reference.
|
||||
We then write the _actual features_ as tuples of the atoms. The machinery
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
# cython: profile=True
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import ctypes
|
||||
|
|
|
@ -85,6 +85,9 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
elif gold.c.ner[i].move == OUT:
|
||||
self.freqs[ENT_IOB][1] += 1
|
||||
self.freqs[ENT_TYPE][0] += 1
|
||||
else:
|
||||
self.freqs[ENT_IOB][1] += 1
|
||||
self.freqs[ENT_TYPE][0] += 1
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
if name == '-':
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
# cython: profile=True
|
||||
"""
|
||||
MALT-style dependency parser
|
||||
"""
|
||||
|
@ -85,18 +84,17 @@ cdef class Parser:
|
|||
|
||||
cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
|
||||
self.model.n_feats, self.model.n_feats)
|
||||
self.parse(stcls, eg.c)
|
||||
with nogil:
|
||||
self.parse(stcls, eg.c)
|
||||
tokens.set_parse(stcls._sent)
|
||||
|
||||
cdef void parse(self, StateClass stcls, ExampleC eg) nogil:
|
||||
while not stcls.is_final():
|
||||
memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
|
||||
|
||||
self.moves.set_valid(eg.is_valid, stcls)
|
||||
fill_context(eg.atoms, stcls)
|
||||
self.model.set_scores(eg.scores, eg.atoms)
|
||||
eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes)
|
||||
|
||||
self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
|
||||
self.moves.finalize_state(stcls)
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
# cython: profile=True
|
||||
from libc.string cimport memcpy, memset
|
||||
from libc.stdint cimport uint32_t
|
||||
from ..vocab cimport EMPTY_LEXEME
|
||||
|
|
|
@ -33,6 +33,11 @@ cdef class TransitionSystem:
|
|||
self.freqs = {}
|
||||
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
|
||||
self.freqs[attr] = defaultdict(int)
|
||||
self.freqs[attr][0] = 1
|
||||
# Ensure we've seen heads. Need an official dependency length limit...
|
||||
for i in range(512):
|
||||
self.freqs[HEAD][i] = 1
|
||||
self.freqs[HEAD][-i] = 1
|
||||
|
||||
cdef int initialize_state(self, StateClass state) except -1:
|
||||
pass
|
||||
|
|
|
@ -71,17 +71,6 @@ cdef class Doc:
|
|||
self.is_tagged = False
|
||||
self.is_parsed = False
|
||||
self._py_tokens = []
|
||||
cdef const LexemeC* lex
|
||||
cdef attr_t orth
|
||||
cdef bint space
|
||||
if orths_and_spaces is not None:
|
||||
for orth, space in orths_and_spaces:
|
||||
lex = <LexemeC*>self.vocab._by_orth.get(orth)
|
||||
if lex != NULL:
|
||||
assert lex.orth == orth
|
||||
self.push_back(lex, space)
|
||||
else:
|
||||
raise Exception('Lexeme not found: %d' % orth)
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""Get a token.
|
||||
|
@ -122,9 +111,12 @@ cdef class Doc:
|
|||
def __unicode__(self):
|
||||
return u''.join([t.string for t in self])
|
||||
|
||||
def __str__(self):
|
||||
return u''.join([t.string for t in self])
|
||||
|
||||
@property
|
||||
def string(self):
|
||||
return unicode(self)
|
||||
return u''.join([t.string for t in self])
|
||||
|
||||
@property
|
||||
def ents(self):
|
||||
|
@ -303,12 +295,11 @@ cdef class Doc:
|
|||
return self
|
||||
|
||||
def to_bytes(self):
|
||||
bits = self.vocab.packer.pack(self)
|
||||
return struct.pack('I', len(bits)) + bits.as_bytes()
|
||||
byte_string = self.vocab.serializer.pack(self)
|
||||
return struct.pack('I', len(byte_string)) + byte_string
|
||||
|
||||
def from_bytes(self, data):
|
||||
bits = BitArray(data)
|
||||
self.vocab.packer.unpack_into(bits, self)
|
||||
self.vocab.serializer.unpack_into(data[4:], self)
|
||||
return self
|
||||
|
||||
@staticmethod
|
||||
|
@ -316,15 +307,14 @@ cdef class Doc:
|
|||
keep_reading = True
|
||||
while keep_reading:
|
||||
try:
|
||||
n_bits_str = file_.read(4)
|
||||
if len(n_bits_str) < 4:
|
||||
n_bytes_str = file_.read(4)
|
||||
if len(n_bytes_str) < 4:
|
||||
break
|
||||
n_bits = struct.unpack('I', n_bits_str)[0]
|
||||
n_bytes = n_bits // 8 + bool(n_bits % 8)
|
||||
n_bytes = struct.unpack('I', n_bytes_str)[0]
|
||||
data = file_.read(n_bytes)
|
||||
except StopIteration:
|
||||
keep_reading = False
|
||||
yield data
|
||||
yield n_bytes_str + data
|
||||
|
||||
# This function is terrible --- need to fix this.
|
||||
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
|
||||
|
|
|
@ -34,6 +34,9 @@ cdef class Token:
|
|||
def __unicode__(self):
|
||||
return self.string
|
||||
|
||||
def __str__(self):
|
||||
return self.string
|
||||
|
||||
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
|
||||
return check_flag(self.c.lex, flag_id)
|
||||
|
||||
|
|
|
@ -65,16 +65,6 @@ def read_tokenization(lang):
|
|||
return entries
|
||||
|
||||
|
||||
def read_encoding_freqs(data_dir):
|
||||
tags = json.load(open(path.join(data_dir, '..', 'pos', 'tag_freqs.json')))
|
||||
heads = json.load(open(path.join(data_dir, '..', 'deps', 'head_freqs.json')))
|
||||
deps = json.load(open(path.join(data_dir, '..', 'deps', 'dep_freqs.json')))
|
||||
iob = json.load(open(path.join(data_dir, '..', 'ner', 'iob_freqs.json')))
|
||||
ne_types = json.load(open(path.join(data_dir, '..', 'ner', 'ne_freqs.json')))
|
||||
return [(TAG, tags), (HEAD, heads), (DEP, deps), (ENT_IOB, iob),
|
||||
(ENT_TYPE, ne_types)]
|
||||
|
||||
|
||||
def read_detoken_rules(lang): # Deprecated?
|
||||
loc = path.join(DATA_DIR, lang, 'detokenize')
|
||||
entries = []
|
||||
|
|
|
@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
|
|||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from .structs cimport LexemeC, TokenC
|
||||
from .typedefs cimport utf8_t, hash_t
|
||||
from .typedefs cimport utf8_t, attr_t, hash_t
|
||||
from .strings cimport StringStore
|
||||
|
||||
|
||||
|
@ -29,9 +29,12 @@ cdef class Vocab:
|
|||
cpdef readonly StringStore strings
|
||||
cdef readonly object pos_tags
|
||||
cdef readonly int length
|
||||
cdef public object packer
|
||||
cdef public object _serializer
|
||||
cdef public object data_dir
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
|
||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
|
||||
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
||||
|
||||
cdef PreshMap _by_hash
|
||||
|
|
108
spacy/vocab.pyx
108
spacy/vocab.pyx
|
@ -1,3 +1,6 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
||||
from libc.string cimport memset
|
||||
from libc.stdint cimport int32_t
|
||||
|
@ -6,6 +9,7 @@ import bz2
|
|||
from os import path
|
||||
import codecs
|
||||
import math
|
||||
import json
|
||||
|
||||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .lexeme cimport set_lex_struct_props
|
||||
|
@ -13,6 +17,7 @@ from .lexeme cimport Lexeme
|
|||
from .strings cimport hash_string
|
||||
from .orth cimport word_shape
|
||||
from .typedefs cimport attr_t
|
||||
from .cfile cimport CFile
|
||||
|
||||
from cymem.cymem cimport Address
|
||||
from . import util
|
||||
|
@ -54,8 +59,19 @@ cdef class Vocab:
|
|||
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
|
||||
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
||||
|
||||
#self.packer = Packer(self, util.read_encoding_freqs(data_dir))
|
||||
self.packer = None
|
||||
self._serializer = None
|
||||
self.data_dir = data_dir
|
||||
|
||||
property serializer:
    def __get__(self):
        """Return the Packer for this vocab, building it on first access.

        The attribute frequencies handed to Packer are read from
        ``serializer.json`` inside ``self.data_dir`` when that file exists;
        otherwise an empty list is used. The result is cached in
        ``self._serializer``.
        """
        if self._serializer is None:
            freqs = []
            if self.data_dir is not None:
                freqs_loc = path.join(self.data_dir, 'serializer.json')
                if path.exists(freqs_loc):
                    # Close the handle deterministically instead of leaving
                    # it to the garbage collector.
                    with open(freqs_loc) as file_:
                        freqs = json.load(file_)
            self._serializer = Packer(self, freqs)
        return self._serializer
|
||||
|
||||
def __len__(self):
|
||||
"""The current number of lexemes stored."""
|
||||
|
@ -82,6 +98,27 @@ cdef class Vocab:
|
|||
self._add_lex_to_vocab(key, lex)
|
||||
return lex
|
||||
|
||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
||||
cdef LexemeC* lex
|
||||
# Fast path: a lexeme is already stored under this orth id.
lex = <LexemeC*>self._by_orth.get(orth)
|
||||
if lex != NULL:
|
||||
return lex
|
||||
# Not stored yet: look up the text for this id in the string store.
cdef unicode string = self.strings[orth]
|
||||
# A lexeme requested with a pool other than the vocab's own is treated as
# out-of-vocabulary and is not registered below.
cdef bint is_oov = mem is not self.mem
|
||||
# Strings shorter than 3 characters are always allocated from the vocab's pool.
# NOTE(review): is_oov was computed from the caller's pool above, so such a
# lexeme is still not added to the vocab -- confirm this is intended.
if len(string) < 3:
|
||||
mem = self.mem
|
||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||
props = self.lexeme_props_getter(string)
|
||||
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
|
||||
if is_oov:
|
||||
# Out-of-vocabulary lexemes get id 0 and are not stored.
lex.id = 0
|
||||
else:
|
||||
# Register the lexeme under both the string hash and its orth id.
self._add_lex_to_vocab(hash_string(string), lex)
|
||||
return lex
|
||||
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
|
||||
self._by_hash.set(key, <void*>lex)
|
||||
self._by_orth.set(lex.orth, <void*>lex)
|
||||
|
@ -138,19 +175,16 @@ cdef class Vocab:
|
|||
if path.exists(loc):
|
||||
assert not path.isdir(loc)
|
||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||
cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
|
||||
assert fp != NULL
|
||||
|
||||
cdef CFile fp = CFile(bytes_loc, 'wb')
|
||||
cdef size_t st
|
||||
cdef size_t addr
|
||||
cdef hash_t key
|
||||
for key, addr in self._by_hash.items():
|
||||
lexeme = <LexemeC*>addr
|
||||
st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp)
|
||||
assert st == 1
|
||||
st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
|
||||
assert st == 1
|
||||
st = fclose(fp)
|
||||
assert st == 0
|
||||
fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
|
||||
fp.write_from(lexeme, sizeof(LexemeC), 1)
|
||||
fp.close()
|
||||
|
||||
def load_lexemes(self, strings_loc, loc):
|
||||
self.strings.load(strings_loc)
|
||||
|
@ -188,7 +222,7 @@ cdef class Vocab:
|
|||
fclose(fp)
|
||||
|
||||
def load_rep_vectors(self, loc):
|
||||
file_ = _CFile(loc, b'rb')
|
||||
cdef CFile file_ = CFile(loc, b'rb')
|
||||
cdef int32_t word_len
|
||||
cdef int32_t vec_len
|
||||
cdef int32_t prev_vec_len = 0
|
||||
|
@ -198,22 +232,20 @@ cdef class Vocab:
|
|||
cdef bytes py_word
|
||||
cdef vector[float*] vectors
|
||||
cdef int i
|
||||
cdef Pool tmp_mem = Pool()
|
||||
while True:
|
||||
try:
|
||||
file_.read(&word_len, sizeof(word_len), 1)
|
||||
file_.read_into(&word_len, sizeof(word_len), 1)
|
||||
except IOError:
|
||||
break
|
||||
file_.read(&vec_len, sizeof(vec_len), 1)
|
||||
file_.read_into(&vec_len, sizeof(vec_len), 1)
|
||||
if prev_vec_len != 0 and vec_len != prev_vec_len:
|
||||
raise VectorReadError.mismatched_sizes(loc, vec_len, prev_vec_len)
|
||||
if 0 >= vec_len >= MAX_VEC_SIZE:
|
||||
raise VectorReadError.bad_size(loc, vec_len)
|
||||
mem = Address(word_len, sizeof(char))
|
||||
chars = <char*>mem.ptr
|
||||
vec = <float*>self.mem.alloc(vec_len, sizeof(float))
|
||||
|
||||
file_.read(chars, sizeof(char), word_len)
|
||||
file_.read(vec, sizeof(float), vec_len)
|
||||
chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
|
||||
vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
|
||||
|
||||
string_id = self.strings[chars[:word_len]]
|
||||
while string_id >= vectors.size():
|
||||
|
@ -235,7 +267,7 @@ cdef class Vocab:
|
|||
|
||||
|
||||
def write_binary_vectors(in_loc, out_loc):
|
||||
cdef _CFile out_file = _CFile(out_loc, 'wb')
|
||||
cdef CFile out_file = CFile(out_loc, 'wb')
|
||||
cdef Address mem
|
||||
cdef int32_t word_len
|
||||
cdef int32_t vec_len
|
||||
|
@ -252,42 +284,12 @@ def write_binary_vectors(in_loc, out_loc):
|
|||
word_len = len(word)
|
||||
vec_len = len(pieces)
|
||||
|
||||
out_file.write(sizeof(word_len), 1, &word_len)
|
||||
out_file.write(sizeof(vec_len), 1, &vec_len)
|
||||
out_file.write_from(&word_len, 1, sizeof(word_len))
|
||||
out_file.write_from(&vec_len, 1, sizeof(vec_len))
|
||||
|
||||
chars = <char*>word
|
||||
out_file.write(sizeof(char), len(word), chars)
|
||||
out_file.write(sizeof(float), vec_len, vec)
|
||||
|
||||
|
||||
cdef class _CFile:
|
||||
cdef FILE* fp
|
||||
def __init__(self, loc, bytes mode):
|
||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||
self.fp = fopen(<char*>bytes_loc, mode)
|
||||
if self.fp == NULL:
|
||||
raise IOError
|
||||
|
||||
def __dealloc__(self):
|
||||
fclose(self.fp)
|
||||
|
||||
def close(self):
|
||||
fclose(self.fp)
|
||||
|
||||
cdef int read(self, void* dest, size_t elem_size, size_t n) except -1:
|
||||
st = fread(dest, elem_size, n, self.fp)
|
||||
if st != n:
|
||||
raise IOError
|
||||
|
||||
cdef int write(self, size_t elem_size, size_t n, void* data) except -1:
|
||||
st = fwrite(data, elem_size, n, self.fp)
|
||||
if st != n:
|
||||
raise IOError
|
||||
|
||||
cdef int write_unicode(self, unicode value):
|
||||
cdef bytes py_bytes = value.encode('utf8')
|
||||
cdef char* chars = <char*>py_bytes
|
||||
self.write(sizeof(char), len(py_bytes), chars)
|
||||
out_file.write_from(chars, len(word), sizeof(char))
|
||||
out_file.write_from(vec, vec_len, sizeof(float))
|
||||
|
||||
|
||||
class VectorReadError(Exception):
|
||||
|
|
|
@ -7,3 +7,19 @@ import os
|
|||
def EN():
|
||||
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
|
||||
return English(data_dir=data_dir)
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption("--models", action="store_true",
|
||||
help="include tests that require full models")
|
||||
parser.addoption("--vectors", action="store_true",
|
||||
help="include word vectors tests")
|
||||
parser.addoption("--slow", action="store_true",
|
||||
help="include slow tests")
|
||||
|
||||
|
||||
|
||||
def pytest_runtest_setup(item):
|
||||
for opt in ['models', 'vectors', 'slow']:
|
||||
if opt in item.keywords and not item.config.getoption("--%s" % opt):
|
||||
pytest.skip("need --%s option to run" % opt)
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
import pytest
|
||||
|
||||
@pytest.mark.models
|
||||
def test_simple_types(EN):
|
||||
tokens = EN(u'Mr. Best flew to New York on Saturday morning.')
|
||||
ents = list(tokens.ents)
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_root(EN):
|
||||
tokens = EN(u"i don't have other assistance")
|
||||
for t in tokens:
|
||||
|
|
|
@ -12,6 +12,7 @@ def sun_text():
|
|||
return text
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_consistency(EN, sun_text):
|
||||
tokens = EN(sun_text)
|
||||
for head in tokens:
|
||||
|
@ -21,6 +22,7 @@ def test_consistency(EN, sun_text):
|
|||
assert child.head is head
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_child_consistency(EN, sun_text):
|
||||
tokens = EN(sun_text)
|
||||
|
||||
|
@ -53,6 +55,7 @@ def test_child_consistency(EN, sun_text):
|
|||
assert not children
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_edges(EN):
|
||||
sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."
|
||||
tokens = EN(sun_text)
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_subtrees(EN):
|
||||
sent = EN('The four wheels on the bus turned quickly')
|
||||
wheels = sent[2]
|
||||
|
|
|
@ -45,7 +45,7 @@ def test1():
|
|||
codec = HuffmanCodec(list(enumerate(probs)))
|
||||
|
||||
py_codes = py_encode(dict(enumerate(probs)))
|
||||
py_codes = py_codes.items()
|
||||
py_codes = list(py_codes.items())
|
||||
py_codes.sort()
|
||||
assert codec.strings == [c for i, c in py_codes]
|
||||
|
||||
|
@ -60,7 +60,7 @@ def test_round_trip():
|
|||
strings = list(codec.strings)
|
||||
codes = {codec.leaves[i]: strings[i] for i in range(len(codec.leaves))}
|
||||
bits = codec.encode(message)
|
||||
string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in bits.as_bytes())
|
||||
string = ''.join('{0:b}'.format(c).rjust(8, '0')[::-1] for c in bits.as_bytes())
|
||||
for word in message:
|
||||
code = codes[word]
|
||||
assert string[:len(code)] == code
|
||||
|
@ -76,7 +76,7 @@ def test_rosetta():
|
|||
symb2freq = defaultdict(int)
|
||||
for ch in txt:
|
||||
symb2freq[ch] += 1
|
||||
by_freq = symb2freq.items()
|
||||
by_freq = list(symb2freq.items())
|
||||
by_freq.sort(reverse=True, key=lambda item: item[1])
|
||||
symbols = [sym for sym, prob in by_freq]
|
||||
|
||||
|
@ -96,6 +96,7 @@ def test_rosetta():
|
|||
assert my_exp_len == py_exp_len
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_vocab(EN):
|
||||
codec = HuffmanCodec([(w.orth, numpy.exp(w.prob)) for w in EN.vocab])
|
||||
expected_length = 0
|
||||
|
@ -105,6 +106,7 @@ def test_vocab(EN):
|
|||
assert 8 < expected_length < 15
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_freqs():
|
||||
freqs = []
|
||||
words = []
|
||||
|
|
23
tests/serialize/test_io.py
Normal file
23
tests/serialize/test_io.py
Normal file
|
@ -0,0 +1,23 @@
|
|||
import pytest
|
||||
|
||||
from spacy.serialize.packer import Packer
|
||||
from spacy.attrs import ORTH, SPACY
|
||||
from spacy.tokens import Doc
|
||||
import math
|
||||
|
||||
|
||||
def test_read_write(EN):
|
||||
doc1 = EN(u'This is a simple test. With a couple of sentences.')
|
||||
doc2 = EN(u'This is another test document.')
|
||||
|
||||
with open('/tmp/spacy_docs.bin', 'wb') as file_:
|
||||
file_.write(doc1.to_bytes())
|
||||
file_.write(doc2.to_bytes())
|
||||
|
||||
with open('/tmp/spacy_docs.bin', 'rb') as file_:
|
||||
bytes1, bytes2 = Doc.read_bytes(file_)
|
||||
r1 = Doc(EN.vocab).from_bytes(bytes1)
|
||||
r2 = Doc(EN.vocab).from_bytes(bytes2)
|
||||
|
||||
assert r1.string == doc1.string
|
||||
assert r2.string == doc2.string
|
|
@ -56,12 +56,12 @@ def test_char_packer(vocab):
|
|||
bits = BitArray()
|
||||
bits.seek(0)
|
||||
|
||||
byte_str = b'the dog jumped'
|
||||
byte_str = bytearray(b'the dog jumped')
|
||||
packer.char_codec.encode(byte_str, bits)
|
||||
bits.seek(0)
|
||||
result = [b''] * len(byte_str)
|
||||
packer.char_codec.decode(bits, result)
|
||||
assert b''.join(result) == byte_str
|
||||
assert bytearray(result) == byte_str
|
||||
|
||||
|
||||
def test_packer_unannotated(tokenizer):
|
||||
|
@ -120,5 +120,3 @@ def test_packer_annotated(tokenizer):
|
|||
assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']
|
||||
assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']
|
||||
assert [(t.head.i - t.i) for t in result] == [1, 1, 0]
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_merge_tokens(EN):
|
||||
tokens = EN(u'Los Angeles start.')
|
||||
assert len(tokens) == 4
|
||||
|
@ -12,6 +14,7 @@ def test_merge_tokens(EN):
|
|||
assert tokens[0].head.orth_ == 'start'
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_merge_heads(EN):
|
||||
tokens = EN(u'I found a pilates class near work.')
|
||||
assert len(tokens) == 8
|
||||
|
|
|
@ -9,6 +9,7 @@ def doc(EN):
|
|||
return EN('This is a sentence. This is another sentence. And a third.')
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_sent_spans(doc):
|
||||
sents = list(doc.sents)
|
||||
assert sents[0].start == 0
|
||||
|
@ -17,6 +18,7 @@ def test_sent_spans(doc):
|
|||
assert sum(len(sent) for sent in sents) == len(doc)
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_root(doc):
|
||||
np = doc[2:4]
|
||||
assert len(np) == 2
|
||||
|
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_am_pm(en_nlp):
|
||||
numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
|
||||
variants = ['a.m.', 'am', 'p.m.', 'pm']
|
||||
|
@ -14,7 +15,7 @@ def test_am_pm(en_nlp):
|
|||
tokens = en_nlp(string, merge_mwes=True)
|
||||
assert tokens[4].orth_ == '%s%s%s' % (num, space, var)
|
||||
ents = list(tokens.ents)
|
||||
assert len(ents) == 1
|
||||
assert len(ents) == 1, ents
|
||||
assert ents[0].label_ == 'TIME', string
|
||||
if ents[0].start == 4 and ents[0].end == 5:
|
||||
assert ents[0].orth_ == '%s%s%s' % (num, space, var)
|
||||
|
|
|
@ -17,6 +17,7 @@ def lemmas(tagged):
|
|||
return [t.lemma_ for t in tagged]
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_lemmas(lemmas, tagged):
|
||||
assert lemmas[0] == 'banana'
|
||||
assert lemmas[1] == 'in'
|
||||
|
|
|
@ -12,6 +12,7 @@ def morph_exc():
|
|||
}
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_load_exc(morph_exc):
|
||||
# Do this local as we want to modify it
|
||||
nlp = English()
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
from spacy.en import English
|
||||
import six
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_tag_names(EN):
|
||||
tokens = EN(u'I ate pizzas with anchovies.', parse=False, tag=True)
|
||||
pizza = tokens[2]
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
|
||||
from spacy.en.attrs import IS_LOWER
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_1():
|
||||
import spacy.en
|
||||
from spacy.parts_of_speech import ADV
|
||||
|
@ -21,6 +22,7 @@ def test_1():
|
|||
assert o == -11.07155704498291
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test2():
|
||||
import spacy.en
|
||||
from spacy.parts_of_speech import ADV
|
||||
|
@ -41,6 +43,7 @@ def test2():
|
|||
-11.07155704498291
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test3():
|
||||
import spacy.en
|
||||
from spacy.parts_of_speech import ADV
|
||||
|
|
|
@ -15,6 +15,7 @@ def test_attr_of_token(EN):
|
|||
assert feats_array[0][0] != feats_array[0][1]
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_tag(EN):
|
||||
text = u'A nice sentence.'
|
||||
tokens = EN(text)
|
||||
|
@ -26,6 +27,7 @@ def test_tag(EN):
|
|||
assert feats_array[3][1] == tokens[3].tag
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_dep(EN):
|
||||
text = u'A nice sentence.'
|
||||
tokens = EN(text)
|
||||
|
|
|
@ -4,6 +4,7 @@ import pytest
|
|||
from spacy.parts_of_speech import ADV
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_prob(EN):
|
||||
tokens = EN(u'Give it back', parse=False)
|
||||
give = tokens[0]
|
||||
|
|
|
@ -7,6 +7,7 @@ from spacy.en.attrs import IS_STOP
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_strings(EN):
|
||||
tokens = EN(u'Give it back! He pleaded.')
|
||||
token = tokens[0]
|
||||
|
|
|
@ -9,6 +9,7 @@ data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
|
|||
# Let this have its own instances, as we have to be careful about memory here
|
||||
# that's the point, after all
|
||||
|
||||
@pytest.mark.models
|
||||
def get_orphan_token(text, i):
|
||||
nlp = English(load_vectors=False, data_dir=data_dir)
|
||||
tokens = nlp(text)
|
||||
|
@ -18,6 +19,7 @@ def get_orphan_token(text, i):
|
|||
return token
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_orphan():
|
||||
orphan = get_orphan_token('An orphan token', 1)
|
||||
gc.collect()
|
||||
|
@ -36,6 +38,7 @@ def _orphan_from_list(toks):
|
|||
return lst
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_list_orphans():
|
||||
# Test case from NSchrading
|
||||
nlp = English(load_vectors=False, data_dir=data_dir)
|
||||
|
|
|
@ -5,7 +5,7 @@ from spacy.tokens import Doc
|
|||
import pytest
|
||||
|
||||
|
||||
def test_getitem(EN):
|
||||
def mest_getitem(EN):
|
||||
tokens = EN(u'Give it back! He pleaded.')
|
||||
assert tokens[0].orth_ == 'Give'
|
||||
assert tokens[-1].orth_ == '.'
|
||||
|
@ -13,10 +13,19 @@ def test_getitem(EN):
|
|||
tokens[len(tokens)]
|
||||
|
||||
|
||||
def test_serialize(EN):
|
||||
tokens = EN(u' Give it back! He pleaded. ')
|
||||
packed = tokens.serialize()
|
||||
new_tokens = Doc.deserialize(EN.vocab, packed)
|
||||
def mest_serialize(EN):
|
||||
tokens = EN(u'Give it back! He pleaded.')
|
||||
packed = tokens.to_bytes()
|
||||
new_tokens = Doc(EN.vocab).from_bytes(packed)
|
||||
assert tokens.string == new_tokens.string
|
||||
assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
|
||||
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
|
||||
|
||||
|
||||
def test_serialize_whitespace(EN):
|
||||
tokens = EN(u' Give it back! He pleaded. ')
|
||||
packed = tokens.to_bytes()
|
||||
new_tokens = Doc(EN.vocab).from_bytes(packed)
|
||||
assert tokens.string == new_tokens.string
|
||||
assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
|
||||
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
|
||||
|
|
|
@ -4,13 +4,14 @@ from spacy.en import English
|
|||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.vectors
|
||||
def test_vec(EN):
|
||||
hype = EN.vocab['hype']
|
||||
assert hype.orth_ == 'hype'
|
||||
assert 0.08 >= hype.repvec[0] > 0.07
|
||||
|
||||
|
||||
@pytest.mark.vectors
|
||||
def test_capitalized(EN):
|
||||
hype = EN.vocab['Hype']
|
||||
assert hype.orth_ == 'Hype'
|
||||
|
|
|
@ -39,7 +39,7 @@ def test_retrieve_id(sstore):
|
|||
|
||||
def test_med_string(sstore):
|
||||
nine_char_string = sstore[b'0123456789']
|
||||
assert sstore[nine_char_string] == b'0123456789'
|
||||
assert sstore[nine_char_string] == u'0123456789'
|
||||
dummy = sstore[b'A']
|
||||
assert sstore[b'0123456789'] == nine_char_string
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user