diff --git a/requirements.txt b/requirements.txt index 37259e747..aae0f9388 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ pathlib numpy>=1.7 cymem>=1.30,<1.32 preshed>=1.0.0,<2.0.0 -thinc>=6.7.3,<6.8.0 +thinc>=6.8.0,<6.9.0 murmurhash>=0.28,<0.29 plac<1.0.0,>=0.9.6 six diff --git a/setup.py b/setup.py index df6afdb2c..ecdf15536 100755 --- a/setup.py +++ b/setup.py @@ -192,7 +192,7 @@ def setup_package(): 'murmurhash>=0.28,<0.29', 'cymem>=1.30,<1.32', 'preshed>=1.0.0,<2.0.0', - 'thinc>=6.7.3,<6.8.0', + 'thinc>=6.8.0,<6.9.0', 'plac<1.0.0,>=0.9.6', 'pip>=9.0.0,<10.0.0', 'six', diff --git a/spacy/__init__.py b/spacy/__init__.py index 068282b1a..1cb7c0cbd 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -13,5 +13,10 @@ def load(name, **overrides): return util.load_model(name, **overrides) +def blank(name, **kwargs): + LangClass = util.get_lang_class(name) + return LangClass(**kwargs) + + def info(model=None, markdown=False): return cli_info(None, model, markdown) diff --git a/spacy/_ml.py b/spacy/_ml.py index 2d0910a53..f1ded666e 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -4,18 +4,22 @@ from thinc.neural import Model, Maxout, Softmax, Affine from thinc.neural._classes.hash_embed import HashEmbed from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module +import random from thinc.neural._classes.convolution import ExtractWindow from thinc.neural._classes.static_vectors import StaticVectors from thinc.neural._classes.batchnorm import BatchNorm from thinc.neural._classes.resnet import Residual from thinc.neural import ReLu +from thinc.neural._classes.selu import SELU from thinc import describe from thinc.describe import Dimension, Synapses, Biases, Gradient from thinc.neural._classes.affine import _set_dimensions_if_needed from thinc.api import FeatureExtracter, with_getitem -from thinc.neural.pooling import Pooling, max_pool, mean_pool +from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool +from thinc.neural._classes.attention import ParametricAttention from thinc.linear.linear import LinearModel +from thinc.api import uniqued, wrap from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP from .tokens.doc import Doc @@ -367,7 +371,7 @@ def preprocess_doc(docs, drop=0.): def build_text_classifier(nr_class, width=64, **cfg): - nr_vector = cfg.get('nr_vector', 1000) + nr_vector = cfg.get('nr_vector', 200) with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}): embed_lower = HashEmbed(width, nr_vector, column=1) embed_prefix = HashEmbed(width//2, nr_vector, column=2) @@ -378,25 +382,26 @@ def build_text_classifier(nr_class, width=64, **cfg): FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE]) >> _flatten_add_lengths >> with_getitem(0, - (embed_lower | embed_prefix | embed_suffix | embed_shape) - >> Maxout(width, width+(width//2)*3) + uniqued( + (embed_lower | embed_prefix | embed_suffix | embed_shape) + >> Maxout(width, width+(width//2)*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) ) - >> Pooling(mean_pool, max_pool) - >> Residual(ReLu(width*2, width*2)) + >> ParametricAttention(width,) + >> Pooling(sum_pool) + >> ReLu(width, width) + >> zero_init(Affine(nr_class, width, drop_factor=0.0)) ) linear_model = ( _preprocess_doc - >> LinearModel(nr_class) - >> logistic + >> LinearModel(nr_class, drop_factor=0.) ) model = ( - #(linear_model | cnn_model) - cnn_model - >> zero_init(Affine(nr_class, width*2+nr_class, drop_factor=0.0)) + (linear_model | cnn_model) + >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) >> logistic ) diff --git a/spacy/about.py b/spacy/about.py index 7cee56422..4bac17503 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy-nightly' -__version__ = '2.0.0a4' +__version__ = '2.0.0a6' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Explosion AI' diff --git a/spacy/compat.py b/spacy/compat.py index 848ea816a..c2ab27d7e 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -5,6 +5,7 @@ import six import ftfy import sys import ujson +import itertools from thinc.neural.util import copy_array @@ -35,6 +36,7 @@ CudaStream = CudaStream cupy = cupy fix_text = ftfy.fix_text copy_array = copy_array +izip = getattr(itertools, 'izip', zip) is_python2 = six.PY2 is_python3 = six.PY3 diff --git a/spacy/language.py b/spacy/language.py index de25157fb..fad2e2119 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -10,6 +10,7 @@ from thinc.neural.optimizers import Adam, SGD import random import ujson from collections import OrderedDict +import itertools from .tokenizer import Tokenizer from .vocab import Vocab @@ -25,7 +26,7 @@ from .pipeline import SimilarityHook from .pipeline import TextCategorizer from . import about -from .compat import json_dumps +from .compat import json_dumps, izip from .attrs import IS_STOP from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .lang.tokenizer_exceptions import TOKEN_MATCH @@ -411,7 +412,7 @@ class Language(object): except StopIteration: pass - def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]): + def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]): """Process texts as a stream, and yield `Doc` objects in order. Supports GIL-free multi-threading. @@ -427,8 +428,16 @@ class Language(object): >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4): >>> assert doc.is_parsed """ + if tuples: + text_context1, text_context2 = itertools.tee(texts) + texts = (tc[0] for tc in text_context1) + contexts = (tc[1] for tc in text_context2) + docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size, + disable=disable) + for doc, context in izip(docs, contexts): + yield (doc, context) + return docs = (self.make_doc(text) for text in texts) - docs = texts for proc in self.pipeline: name = getattr(proc, 'name', None) if name in disable: diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index d48cae26d..947f0a1f1 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -417,7 +417,7 @@ class NeuralTagger(BaseThincComponent): ('vocab', lambda p: self.vocab.from_disk(p)), ('tag_map', load_tag_map), ('model', load_model), - ('cfg', lambda p: self.cfg.update(ujson.load(p.open()))), + ('cfg', lambda p: self.cfg.update(_load_cfg(p))) )) util.from_disk(path, deserialize, exclude) return self