Merge remote-tracking branch 'upstream/develop' into indonesian

This commit is contained in:
Jim Geovedi 2017-07-27 10:55:34 +07:00
commit c194f7ae26
8 changed files with 39 additions and 18 deletions

View File

@ -3,7 +3,7 @@ pathlib
numpy>=1.7
cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0
thinc>=6.7.3,<6.8.0
thinc>=6.8.0,<6.9.0
murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6
six

View File

@ -192,7 +192,7 @@ def setup_package():
'murmurhash>=0.28,<0.29',
'cymem>=1.30,<1.32',
'preshed>=1.0.0,<2.0.0',
'thinc>=6.7.3,<6.8.0',
'thinc>=6.8.0,<6.9.0',
'plac<1.0.0,>=0.9.6',
'pip>=9.0.0,<10.0.0',
'six',

View File

@ -13,5 +13,10 @@ def load(name, **overrides):
return util.load_model(name, **overrides)
def blank(name, **kwargs):
LangClass = util.get_lang_class(name)
return LangClass(**kwargs)
def info(model=None, markdown=False):
return cli_info(None, model, markdown)

View File

@ -4,18 +4,22 @@ from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
import random
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm
from thinc.neural._classes.resnet import Residual
from thinc.neural import ReLu
from thinc.neural._classes.selu import SELU
from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed
from thinc.api import FeatureExtracter, with_getitem
from thinc.neural.pooling import Pooling, max_pool, mean_pool
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
from thinc.neural._classes.attention import ParametricAttention
from thinc.linear.linear import LinearModel
from thinc.api import uniqued, wrap
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
from .tokens.doc import Doc
@ -367,7 +371,7 @@ def preprocess_doc(docs, drop=0.):
def build_text_classifier(nr_class, width=64, **cfg):
nr_vector = cfg.get('nr_vector', 1000)
nr_vector = cfg.get('nr_vector', 200)
with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}):
embed_lower = HashEmbed(width, nr_vector, column=1)
embed_prefix = HashEmbed(width//2, nr_vector, column=2)
@ -378,25 +382,26 @@ def build_text_classifier(nr_class, width=64, **cfg):
FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE])
>> _flatten_add_lengths
>> with_getitem(0,
(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(width, width+(width//2)*3)
uniqued(
(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(width, width+(width//2)*3))
>> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
>> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
>> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
)
>> Pooling(mean_pool, max_pool)
>> Residual(ReLu(width*2, width*2))
>> ParametricAttention(width,)
>> Pooling(sum_pool)
>> ReLu(width, width)
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
)
linear_model = (
_preprocess_doc
>> LinearModel(nr_class)
>> logistic
>> LinearModel(nr_class, drop_factor=0.)
)
model = (
#(linear_model | cnn_model)
cnn_model
>> zero_init(Affine(nr_class, width*2+nr_class, drop_factor=0.0))
(linear_model | cnn_model)
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
>> logistic
)

View File

@ -3,7 +3,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy-nightly'
__version__ = '2.0.0a4'
__version__ = '2.0.0a6'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'

View File

@ -5,6 +5,7 @@ import six
import ftfy
import sys
import ujson
import itertools
from thinc.neural.util import copy_array
@ -35,6 +36,7 @@ CudaStream = CudaStream
cupy = cupy
fix_text = ftfy.fix_text
copy_array = copy_array
izip = getattr(itertools, 'izip', zip)
is_python2 = six.PY2
is_python3 = six.PY3

View File

@ -10,6 +10,7 @@ from thinc.neural.optimizers import Adam, SGD
import random
import ujson
from collections import OrderedDict
import itertools
from .tokenizer import Tokenizer
from .vocab import Vocab
@ -25,7 +26,7 @@ from .pipeline import SimilarityHook
from .pipeline import TextCategorizer
from . import about
from .compat import json_dumps
from .compat import json_dumps, izip
from .attrs import IS_STOP
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
@ -411,7 +412,7 @@ class Language(object):
except StopIteration:
pass
def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
@ -427,8 +428,16 @@ class Language(object):
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
>>> assert doc.is_parsed
"""
if tuples:
text_context1, text_context2 = itertools.tee(texts)
texts = (tc[0] for tc in text_context1)
contexts = (tc[1] for tc in text_context2)
docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size,
disable=disable)
for doc, context in izip(docs, contexts):
yield (doc, context)
return
docs = (self.make_doc(text) for text in texts)
docs = texts
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disable:

View File

@ -417,7 +417,7 @@ class NeuralTagger(BaseThincComponent):
('vocab', lambda p: self.vocab.from_disk(p)),
('tag_map', load_tag_map),
('model', load_model),
('cfg', lambda p: self.cfg.update(ujson.load(p.open()))),
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
))
util.from_disk(path, deserialize, exclude)
return self