Merge branch 'master' into spacy.io
Commit: e8f284c741
@@ -48,4 +48,6 @@ redirects = [
     {from = "/api/sentencesegmenter", to="/api/sentencizer"},
     {from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true},
     {from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true},
+    # Renamed universe projects
+    {from = "/universe/project/spacy-pytorch-transformers", to = "/universe/project/spacy-transformers", force = true}
 ]
@@ -11,6 +11,7 @@ numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 plac<1.0.0,>=0.9.6
 pathlib==1.0.1; python_version < "3.4"
+importlib_metadata>=0.23; python_version < "3.8"
 # Optional dependencies
 jsonschema>=2.6.0,<3.1.0
 # Development dependencies
@@ -39,6 +39,7 @@ setup_requires =
    murmurhash>=0.28.0,<1.1.0
    thinc>=7.1.1,<7.2.0
install_requires =
    setuptools
    numpy>=1.15.0
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
@@ -50,6 +51,7 @@ install_requires =
     wasabi>=0.2.0,<1.1.0
     srsly>=0.1.0,<1.1.0
     pathlib==1.0.1; python_version < "3.4"
+    importlib_metadata>=0.23; python_version < "3.8"

 [options.extras_require]
 lookups =
@@ -14,6 +14,7 @@ from .glossary import explain
 from .about import __version__
 from .errors import Errors, Warnings, deprecation_warning
 from . import util
+from .util import register_architecture, get_architecture


 if sys.maxunicode == 65535:
spacy/_ml.py | 14
@@ -953,16 +953,24 @@ class CharacterEmbed(Model):
         return output, backprop_character_embed


-def get_cossim_loss(yh, y):
+def get_cossim_loss(yh, y, ignore_zeros=False):
+    xp = get_array_module(yh)
+    # Find the zero vectors
+    if ignore_zeros:
+        zero_indices = xp.abs(y).sum(axis=1) == 0
     # Add a small constant to avoid 0 vectors
     yh = yh + 1e-8
     y = y + 1e-8
     # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
-    xp = get_array_module(yh)
     norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
     norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
     mul_norms = norm_yh * norm_y
     cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
     d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
-    loss = xp.abs(cosine - 1).sum()
+    losses = xp.abs(cosine - 1)
+    if ignore_zeros:
+        # If the target was a zero vector, don't count it in the loss.
+        d_yh[zero_indices] = 0
+        losses[zero_indices] = 0
+    loss = losses.sum()
     return loss, -d_yh
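The new `ignore_zeros` flag masks rows whose target vector is all zeros (for example tokens with no pretrained vector), so they contribute nothing to the loss or the gradient. Below is a minimal standalone NumPy sketch that mirrors the arithmetic in the hunk above; it is not the spaCy internals themselves, and the example inputs are made up.

import numpy as np


def cossim_loss(yh, y, ignore_zeros=False):
    # Same math as the diff above, written against plain NumPy.
    if ignore_zeros:
        zero_indices = np.abs(y).sum(axis=1) == 0
    yh = yh + 1e-8
    y = y + 1e-8
    norm_yh = np.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = np.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
    losses = np.abs(cosine - 1)
    if ignore_zeros:
        # Rows whose target is a zero vector are excluded from loss and gradient.
        d_yh[zero_indices] = 0
        losses[zero_indices] = 0
    return losses.sum(), -d_yh


prediction = np.array([[1.0, 0.0], [0.5, 0.5]])
target = np.array([[0.0, 1.0], [0.0, 0.0]])  # second row is a zero (OOV) vector
loss_all, _ = cossim_loss(prediction, target)
loss_skip, grad_skip = cossim_loss(prediction, target, ignore_zeros=True)
print(loss_all, loss_skip)  # the zero-vector row no longer adds to the loss
print(grad_skip[1])         # and its gradient row is all zeros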
@@ -7,7 +7,7 @@ from ...gold import docs_to_json
 from ...util import get_lang_class, minibatch


-def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
+def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
     if lang is None:
         raise ValueError("No --lang specified, but tokenization required")
     json_docs = []
@@ -6,7 +6,6 @@ import requests
 import os
 import subprocess
 import sys
-import pkg_resources
 from wasabi import Printer

 from .link import link
@@ -87,6 +86,8 @@ def download(model, direct=False, *pip_args):

 def require_package(name):
     try:
+        import pkg_resources
+
         pkg_resources.working_set.require(name)
         return True
     except:  # noqa: E722
@@ -35,6 +35,7 @@ from .train import _load_pretrained_tok2vec
     output_dir=("Directory to write models to on each epoch", "positional", None, str),
     width=("Width of CNN layers", "option", "cw", int),
     depth=("Depth of CNN layers", "option", "cd", int),
+    bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
     embed_rows=("Number of embedding rows", "option", "er", int),
     loss_func=(
         "Loss function to use for the objective. Either 'L2' or 'cosine'",
@@ -80,6 +81,7 @@ def pretrain(
     output_dir,
     width=96,
     depth=4,
+    bilstm_depth=2,
     embed_rows=2000,
     loss_func="cosine",
     use_vectors=False,
@@ -116,6 +118,10 @@ def pretrain(
     util.fix_random_seed(seed)

     has_gpu = prefer_gpu()
+    if has_gpu:
+        import torch
+
+        torch.set_default_tensor_type("torch.cuda.FloatTensor")
     msg.info("Using GPU" if has_gpu else "Not using GPU")

     output_dir = Path(output_dir)
@@ -151,7 +157,7 @@ def pretrain(
         embed_rows,
         conv_depth=depth,
         pretrained_vectors=pretrained_vectors,
-        bilstm_depth=0,  # Requires PyTorch. Experimental.
+        bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
         cnn_maxout_pieces=3,  # You can try setting this higher
         subword_features=True,  # Set to False for Chinese etc
     ),
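With `bilstm_depth` now forwarded to the model instead of being hard-coded to 0, a BiLSTM can be requested through the pretrain command. A hypothetical invocation from Python, assuming the usual `(texts_loc, vectors_model, output_dir, ...)` positional arguments; the file paths are placeholders and PyTorch must be installed for any depth > 0.

from spacy.cli import pretrain

pretrain(
    "texts.jsonl",        # raw texts, one JSON object per line (placeholder path)
    "en_vectors_web_lg",  # vectors model providing the training targets
    "pretrain_output",    # output directory written on each epoch (placeholder)
    bilstm_depth=2,       # forwarded to Tok2Vec instead of the old hard-coded 0
)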
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function

-import pkg_resources
 from pathlib import Path
 import sys
 import requests
@@ -109,6 +108,8 @@ def get_model_links(compat):


 def get_model_pkgs(compat, all_models):
+    import pkg_resources
+
     pkgs = {}
     for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
         package = pkg_name.replace("-", "_")
@@ -35,6 +35,11 @@ try:
 except ImportError:
     cupy = None

+try:  # Python 3.8
+    import importlib.metadata as importlib_metadata
+except ImportError:
+    import importlib_metadata  # noqa: F401
+
 try:
     from thinc.neural.optimizers import Optimizer  # noqa: F401
 except ImportError:
@@ -95,6 +95,7 @@ class Warnings(object):
             "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
             "If this is surprising, make sure you have the spacy-lookups-data "
             "package installed.")
+    W023 = ("Multiprocessing of Language.pipe is not supported in Python2. 'n_process' will be set to 1.")


 @add_codes
@@ -495,6 +496,8 @@ class Errors(object):
     E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
             "Lookups containing the lemmatization tables. See the docs for "
             "details: https://spacy.io/api/lemmatizer#init")
+    E174 = ("Architecture '{name}' not found in registry. Available "
+            "names: {names}")


 @add_codes
@@ -184,7 +184,7 @@ _russian_lower = r"ёа-я"
 _russian_upper = r"ЁА-Я"
 _russian = r"ёа-яЁА-Я"

-_sinhala = r"\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6"
+_sinhala = r"\u0D80-\u0DFF"

 _tatar_lower = r"әөүҗңһ"
 _tatar_upper = r"ӘӨҮҖҢҺ"
@@ -1,8 +1,11 @@
# coding: utf8
from __future__ import absolute_import, unicode_literals

import atexit
import random
import itertools
from warnings import warn
from spacy.util import minibatch
import weakref
import functools
from collections import OrderedDict
@@ -10,6 +13,8 @@ from contextlib import contextmanager
 from copy import copy, deepcopy
 from thinc.neural import Model
 import srsly
+import multiprocessing as mp
+from itertools import chain, cycle

 from .tokenizer import Tokenizer
 from .vocab import Vocab
@@ -21,7 +26,7 @@ from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
 from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .pipeline import EntityRuler
 from .pipeline import Morphologizer
-from .compat import izip, basestring_
+from .compat import izip, basestring_, is_python2
 from .gold import GoldParse
 from .scorer import Scorer
 from ._ml import link_vectors_to_models, create_default_optimizer
@@ -30,8 +35,9 @@ from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
 from .lang.tokenizer_exceptions import TOKEN_MATCH
 from .lang.tag_map import TAG_MAP
+from .tokens import Doc
 from .lang.lex_attrs import LEX_ATTRS, is_stop
-from .errors import Errors, Warnings, deprecation_warning
+from .errors import Errors, Warnings, deprecation_warning, user_warning
 from . import util
 from . import about

@@ -733,6 +739,7 @@ class Language(object):
         disable=[],
         cleanup=False,
         component_cfg=None,
+        n_process=1,
     ):
         """Process texts as a stream, and yield `Doc` objects in order.

@@ -746,12 +753,20 @@ class Language(object):
             use. Experimental.
         component_cfg (dict): An optional dictionary with extra keyword
             arguments for specific components.
+        n_process (int): Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`.
         YIELDS (Doc): Documents in the order of the original text.

         DOCS: https://spacy.io/api/language#pipe
         """
+        # raw_texts will be used later to stop iterator.
+        texts, raw_texts = itertools.tee(texts)
+        if is_python2 and n_process != 1:
+            user_warning(Warnings.W023)
+            n_process = 1
         if n_threads != -1:
             deprecation_warning(Warnings.W016)
+        if n_process == -1:
+            n_process = mp.cpu_count()
         if as_tuples:
             text_context1, text_context2 = itertools.tee(texts)
             texts = (tc[0] for tc in text_context1)
@@ -765,9 +780,12 @@ class Language(object):
             for doc, context in izip(docs, contexts):
                 yield (doc, context)
             return
-        docs = (self.make_doc(text) for text in texts)
         if component_cfg is None:
             component_cfg = {}
+
+        pipes = (
+            []
+        )  # contains functools.partial objects so that multiprocessing workers can be created easily
         for name, proc in self.pipeline:
             if name in disable:
                 continue
@@ -775,10 +793,20 @@ class Language(object):
             # Allow component_cfg to overwrite the top-level kwargs.
             kwargs.setdefault("batch_size", batch_size)
             if hasattr(proc, "pipe"):
-                docs = proc.pipe(docs, **kwargs)
+                f = functools.partial(proc.pipe, **kwargs)
             else:
                 # Apply the function, but yield the doc
-                docs = _pipe(proc, docs, kwargs)
+                f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
+            pipes.append(f)
+
+        if n_process != 1:
+            docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
+        else:
+            # if n_process == 1, no processes are forked.
+            docs = (self.make_doc(text) for text in texts)
+            for pipe in pipes:
+                docs = pipe(docs)
+
         # Track weakrefs of "recent" documents, so that we can see when they
         # expire from memory. When they do, we know we don't need old strings.
         # This way, we avoid maintaining an unbounded growth in string entries
@@ -809,6 +837,46 @@ class Language(object):
                     self.tokenizer._reset_cache(keys)
                 nr_seen = 0

+    def _multiprocessing_pipe(self, texts, pipes, n_process, batch_size):
+        # raw_texts is used later to stop iteration.
+        texts, raw_texts = itertools.tee(texts)
+        # for sending texts to the workers
+        texts_q = [mp.Queue() for _ in range(n_process)]
+        # for receiving byte-encoded docs from the workers
+        bytedocs_recv_ch, bytedocs_send_ch = zip(
+            *[mp.Pipe(False) for _ in range(n_process)]
+        )
+
+        batch_texts = minibatch(texts, batch_size)
+        # Sender sends texts to the workers.
+        # This is necessary to properly handle texts of infinite length.
+        # (In this case, all data cannot be sent to the workers at once.)
+        sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
+        # send twice so that the processes are kept busy
+        sender.send()
+        sender.send()
+
+        procs = [
+            mp.Process(target=_apply_pipes, args=(self.make_doc, pipes, rch, sch))
+            for rch, sch in zip(texts_q, bytedocs_send_ch)
+        ]
+        for proc in procs:
+            proc.start()
+
+        # Cycle the channels so the order of the docs is not broken.
+        # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
+        byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
+        docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
+        try:
+            for i, (_, doc) in enumerate(zip(raw_texts, docs), 1):
+                yield doc
+                if i % batch_size == 0:
+                    # tell `sender` that one batch was consumed.
+                    sender.step()
+        finally:
+            for proc in procs:
+                proc.terminate()
+
     def to_disk(self, path, exclude=tuple(), disable=None):
         """Save the current state to a directory. If a model is loaded, this
         will include the model.
@@ -987,12 +1055,55 @@ class DisabledPipes(list):
         self[:] = []


-def _pipe(func, docs, kwargs):
+def _pipe(docs, proc, kwargs):
     # We added some args for pipe that __call__ doesn't expect.
     kwargs = dict(kwargs)
     for arg in ["n_threads", "batch_size"]:
         if arg in kwargs:
             kwargs.pop(arg)
     for doc in docs:
-        doc = func(doc, **kwargs)
+        doc = proc(doc, **kwargs)
         yield doc
+
+
+def _apply_pipes(make_doc, pipes, receiver, sender):
+    """Worker for Language.pipe
+
+    Args:
+        receiver (multiprocessing.Connection): Pipe to receive text. Usually created by `multiprocessing.Pipe()`
+        sender (multiprocessing.Connection): Pipe to send doc. Usually created by `multiprocessing.Pipe()`
+    """
+    while True:
+        texts = receiver.get()
+        docs = (make_doc(text) for text in texts)
+        for pipe in pipes:
+            docs = pipe(docs)
+        # Connection does not accept unpicklable objects, so send a list.
+        sender.send([doc.to_bytes() for doc in docs])
+
+
+class _Sender:
+    """Util for sending data to multiprocessing workers in Language.pipe"""
+
+    def __init__(self, data, queues, chunk_size):
+        self.data = iter(data)
+        self.queues = iter(cycle(queues))
+        self.chunk_size = chunk_size
+        self.count = 0
+
+    def send(self):
+        """Send chunk_size items from self.data to channels."""
+        for item, q in itertools.islice(
+            zip(self.data, cycle(self.queues)), self.chunk_size
+        ):
+            # cycle the channels so the texts are distributed evenly
+            q.put(item)
+
+    def step(self):
+        """Tell the sender that one item was consumed.
+
+        Data is sent to the workers after every chunk_size calls."""
+        self.count += 1
+        if self.count >= self.chunk_size:
+            self.count = 0
+            self.send()
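Taken together, the changes above let `Language.pipe` fan work out to forked worker processes on Python 3. A minimal usage sketch with a blank pipeline and placeholder texts:

import spacy


def main():
    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    texts = ["Hello world.", "This is spaCy.", "Please try the pipe method!"] * 100
    # n_process=2 forks two workers; n_process=-1 would use multiprocessing.cpu_count()
    for doc in nlp.pipe(texts, n_process=2, batch_size=50):
        assert doc.is_sentenced


if __name__ == "__main__":
    # Guarding the entry point keeps the sketch safe on platforms that spawn processes.
    main()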
@@ -225,7 +225,7 @@ cdef class PhraseMatcher:
         for i in range(c_matches.size()):
             matches.append((c_matches[i].match_id, c_matches[i].start, c_matches[i].end))
         for i, (ent_id, start, end) in enumerate(matches):
-            on_match = self._callbacks.get(ent_id)
+            on_match = self._callbacks.get(self.vocab.strings[ent_id])
             if on_match is not None:
                 on_match(self, doc, i, matches)
         return matches
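The callbacks dict is keyed by the string name passed to `PhraseMatcher.add`, while `ent_id` in the returned match tuple is the hash, hence the lookup through `self.vocab.strings`. A small illustrative sketch of the behaviour this restores (the pattern and callback here are made up):

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)


def on_match(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    print("matched:", doc[start:end].text)


matcher.add("COMPANY", on_match, nlp.make_doc("Google Now"))
doc = nlp.make_doc("I like Google Now best")
matches = matcher(doc)  # on_match fires once per match again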
@@ -29,7 +29,7 @@ from .._ml import Tok2Vec, build_tagger_model, cosine, get_cossim_loss
 from .._ml import build_text_classifier, build_simple_cnn_text_classifier
 from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
-from .._ml import masked_language_model, create_default_optimizer
+from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
 from ..errors import Errors, TempErrors, user_warning, Warnings
 from .. import util

@@ -880,8 +880,7 @@ class ClozeMultitask(Pipe):
         # and look them up all at once. This prevents data copying.
         ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
         target = vectors[ids]
-        gradient = (prediction - target) / prediction.shape[0]
-        loss = (gradient**2).sum()
+        loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
         return float(loss), gradient

     def update(self, docs, golds, drop=0., sgd=None, losses=None):
@@ -3,6 +3,7 @@ from __future__ import unicode_literals

 import pytest
 import re
+from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
 from spacy.tokens import Doc, Token

@@ -418,3 +419,13 @@ def test_matcher_valid_callback(en_vocab):
     with pytest.raises(ValueError):
         matcher.add("TEST", [], [{"TEXT": "test"}])
         matcher(Doc(en_vocab, words=["test"]))
+
+
+def test_matcher_callback(en_vocab):
+    mock = Mock()
+    matcher = Matcher(en_vocab)
+    pattern = [{"ORTH": "test"}]
+    matcher.add("Rule", mock, pattern)
+    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
+    matches = matcher(doc)
+    mock.assert_called_once_with(matcher, doc, 0, matches)
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 import pytest
+from mock import Mock
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc
 from ..util import get_doc
@@ -215,3 +216,13 @@ def test_attr_pipeline_checks(en_vocab):
         matcher.add("TEST3", None, doc3)
     matcher = PhraseMatcher(en_vocab, attr="TEXT")
     matcher.add("TEST3", None, doc3)
+
+
+def test_phrase_matcher_callback(en_vocab):
+    mock = Mock()
+    doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
+    pattern = Doc(en_vocab, words=["Google", "Now"])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("COMPANY", mock, pattern)
+    matches = matcher(doc)
+    mock.assert_called_once_with(matcher, doc, 0, matches)
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 import pytest
+import spacy
 from spacy.pipeline import Sentencizer
 from spacy.tokens import Doc

@@ -85,3 +86,26 @@ def test_sentencizer_serialize_bytes(en_vocab):
     bytes_data = sentencizer.to_bytes()
     new_sentencizer = Sentencizer().from_bytes(bytes_data)
     assert new_sentencizer.punct_chars == set(punct_chars)
+
+
+@pytest.mark.parametrize(
+    # fmt: off
+    "lang,text",
+    [
+        ('bn', 'বাংলা ভাষা (বাঙলা, বাঙ্গলা, তথা বাঙ্গালা নামগুলোতেও পরিচিত) একটি ইন্দো-আর্য ভাষা, যা দক্ষিণ এশিয়ার বাঙালি জাতির প্রধান কথ্য ও লেখ্য ভাষা। মাতৃভাষীর সংখ্যায় বাংলা ইন্দো-ইউরোপীয় ভাষা পরিবারের চতুর্থ ও বিশ্বের ষষ্ঠ বৃহত্তম ভাষা।[৫] মোট ব্যবহারকারীর সংখ্যা অনুসারে বাংলা বিশ্বের সপ্তম বৃহত্তম ভাষা। বাংলা সার্বভৌম ভাষাভিত্তিক জাতিরাষ্ট্র বাংলাদেশের একমাত্র রাষ্ট্রভাষা তথা সরকারি ভাষা[৬] এবং ভারতের পশ্চিমবঙ্গ, ত্রিপুরা, আসামের বরাক উপত্যকার সরকারি ভাষা। বঙ্গোপসাগরে অবস্থিত আন্দামান দ্বীপপুঞ্জের প্রধান কথ্য ভাষা বাংলা। এছাড়া ভারতের ঝাড়খণ্ড, বিহার, মেঘালয়, মিজোরাম, উড়িষ্যা রাজ্যগুলোতে উল্লেখযোগ্য পরিমাণে বাংলাভাষী জনগণ রয়েছে। ভারতে হিন্দির পরেই সর্বাধিক প্রচলিত ভাষা বাংলা।[৭][৮] এছাড়াও মধ্য প্রাচ্য, আমেরিকা ও ইউরোপে উল্লেখযোগ্য পরিমাণে বাংলাভাষী অভিবাসী রয়েছে।[৯] সারা বিশ্বে সব মিলিয়ে ২৬ কোটির অধিক লোক দৈনন্দিন জীবনে বাংলা ব্যবহার করে।[২] বাংলাদেশের জাতীয় সঙ্গীত এবং ভারতের জাতীয় সঙ্গীত ও স্তোত্র বাংলাতে রচিত।'),
+        ('de', 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache. Ihr Sprachraum umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig. Außerdem ist sie eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z. B. in Rumänien und Südafrika, sowie Nationalsprache im afrikanischen Namibia.'),
+        ('hi', 'हिन्दी विश्व की एक प्रमुख भाषा है एवं भारत की राजभाषा है। केन्द्रीय स्तर पर भारत में दूसरी आधिकारिक भाषा अंग्रेजी है। यह हिंदुस्तानी भाषा की एक मानकीकृत रूप है जिसमें संस्कृत के तत्सम तथा तद्भव शब्दों का प्रयोग अधिक है और अरबी-फ़ारसी शब्द कम हैं। हिंदी संवैधानिक रूप से भारत की राजभाषा और भारत की सबसे अधिक बोली और समझी जाने वाली भाषा है। हालाँकि, हिन्दी भारत की राष्ट्रभाषा नहीं है,[3] क्योंकि भारत के संविधान में कोई भी भाषा को ऐसा दर्जा नहीं दिया गया था।[4][5] चीनी के बाद यह विश्व में सबसे अधिक बोली जाने वाली भाषा भी है। विश्व आर्थिक मंच की गणना के अनुसार यह विश्व की दस शक्तिशाली भाषाओं में से एक है।[6]'),
+        ('kn', 'ದ್ರಾವಿಡ ಭಾಷೆಗಳಲ್ಲಿ ಪ್ರಾಮುಖ್ಯವುಳ್ಳ ಭಾಷೆಯೂ ಭಾರತದ ಪುರಾತನವಾದ ಭಾಷೆಗಳಲ್ಲಿ ಒಂದೂ ಆಗಿರುವ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಅದರ ವಿವಿಧ ರೂಪಗಳಲ್ಲಿ ಸುಮಾರು ೪೫ ದಶಲಕ್ಷ ಜನರು ಆಡು ನುಡಿಯಾಗಿ ಬಳಸುತ್ತಲಿದ್ದಾರೆ. ಕನ್ನಡ ಕರ್ನಾಟಕ ರಾಜ್ಯದ ಆಡಳಿತ ಭಾಷೆ.[೧೧] ಜಗತ್ತಿನಲ್ಲಿ ಅತ್ಯಂತ ಹೆಚ್ಚು ಮಂದಿ ಮಾತನಾಡುವ ಭಾಷೆಯೆಂಬ ನೆಲೆಯಲ್ಲಿ ಇಪ್ಪತೊಂಬತ್ತನೆಯ ಸ್ಥಾನ ಕನ್ನಡಕ್ಕಿದೆ. ೨೦೧೧ರ ಜನಗಣತಿಯ ಪ್ರಕಾರ ಜಗತ್ತಿನಲ್ಲಿ ೬.೪ ಕೋಟಿ ಜನಗಳು ಕನ್ನಡ ಮಾತನಾಡುತ್ತಾರೆ ಎಂದು ತಿಳಿದುಬಂದಿದೆ. ಇವರಲ್ಲಿ ೫.೫ ಕೋಟಿ ಜನಗಳ ಮಾತೃಭಾಷೆ ಕನ್ನಡವಾಗಿದೆ. ಬ್ರಾಹ್ಮಿ ಲಿಪಿಯಿಂದ ರೂಪುಗೊಂಡ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಉಪಯೋಗಿಸಿ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಬರೆಯಲಾಗುತ್ತದೆ. ಕನ್ನಡ ಬರಹದ ಮಾದರಿಗಳಿಗೆ ಸಾವಿರದ ಐನೂರು ವರುಷಗಳ ಚರಿತ್ರೆಯಿದೆ. ಕ್ರಿ.ಶ. ಆರನೆಯ ಶತಮಾನದ ಪಶ್ಚಿಮ ಗಂಗ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ [೧೨] ಮತ್ತು ಒಂಬತ್ತನೆಯ ಶತಮಾನದ ರಾಷ್ಟ್ರಕೂಟ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ ಹಳಗನ್ನಡ ಸಾಹಿತ್ಯ ಅತ್ಯಂತ ಹೆಚ್ಚಿನ ರಾಜಾಶ್ರಯ ಪಡೆಯಿತು.[೧೩][೧೪] ಅದಲ್ಲದೆ ಸಾವಿರ ವರುಷಗಳ ಸಾಹಿತ್ಯ ಪರಂಪರೆ ಕನ್ನಡಕ್ಕಿದೆ.[೧೫]ವಿನೋಬಾ ಭಾವೆ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಲಿಪಿಗಳ ರಾಣಿಯೆಂದು ಹೊಗಳಿದ್ದಾರೆ.[ಸೂಕ್ತ ಉಲ್ಲೇಖನ ಬೇಕು]'),
+        ('si', 'ශ්රී ලංකාවේ ප්රධාන ජාතිය වන සිංහල ජනයාගේ මව් බස සිංහල වෙයි. අද වන විට මිලියන 20 කට අධික සිංහල සහ මිලියන 3කට අධික සිංහල නොවන ජනගහනයක් සිංහල භාෂාව භාවිත කරති. සිංහල ඉන්දු-යුරෝපීය භාෂාවල උප ගණයක් වන ඉන්දු-ආර්ය භාෂා ගණයට අයිති වන අතර මාල දිවයින භාවිත කරන දිවෙහි භාෂාව සිංහලයෙන් පැවත එන්නකි. සිංහල ශ්රී ලංකාවේ නිල භාෂාවයි .'),
+        ('ta', 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும். தமிழ் திராவிட மொழிக் குடும்பத்தின் முதன்மையான மொழிகளில் ஒன்றும் செம்மொழியும் ஆகும். இந்தியா, இலங்கை, மலேசியா, சிங்கப்பூர் ஆகிய நாடுகளில் அதிக அளவிலும், ஐக்கிய அரபு அமீரகம், தென்னாப்பிரிக்கா, மொரிசியசு, பிஜி, ரீயூனியன், டிரினிடாட் போன்ற நாடுகளில் சிறிய அளவிலும் தமிழ் பேசப்படுகிறது. 1997ஆம் ஆண்டுப் புள்ளி விவரப்படி உலகம் முழுவதிலும் 8 கோடி (80 மில்லியன்) மக்களால் பேசப்படும் தமிழ்[13], ஒரு மொழியைத் தாய்மொழியாகக் கொண்டு பேசும் மக்களின் எண்ணிக்கை அடிப்படையில் பதினெட்டாவது இடத்தில் உள்ளது.[14] இணையத்தில் அதிகம் பயன்படுத்தப்படும் இந்திய மொழிகளில் தமிழ் முதன்மையாக உள்ளதாக 2017 ஆவது ஆண்டில் நடைபெற்ற கூகுள் கணக்கெடுப்பில் தெரிய வந்தது.[15]'),
+        ('te', 'ఆంధ్ర ప్రదేశ్, తెలంగాణ రాష్ట్రాల అధికార భాష తెలుగు. భారత దేశంలో తెలుగు మాతృభాషగా మాట్లాడే 8.7 కోట్ల (2001) జనాభాతో [1] ప్రాంతీయ భాషలలో మొదటి స్థానంలో ఉంది. ప్రపంచంలోని ప్రజలు అత్యధికముగా మాట్లాడే భాషలలో 15 స్థానములోనూ, భారత దేశములో హిందీ, తర్వాత స్థానములోనూ నిలుస్తుంది. పాతవైన ప్రపంచ భాష గణాంకాల (ఎథ్నోలాగ్) ప్రకారం ప్రపంచవ్యాప్తంగా 7.4 కోట్లు మందికి మాతృభాషగా ఉంది.[2] మొదటి భాషగా మాట్లాడతారు. అతి ప్రాచీన దేశ భాషలలో సంస్కృతము తమిళముతో బాటు తెలుగు భాషను 2008 అక్టోబరు 31న భారత ప్రభుత్వము గుర్తించింది.'),
+        ('ur', 'اُردُو لشکری زبان[8] (یا جدید معیاری اردو) برصغیر کی معیاری زبانوں میں سے ایک ہے۔ یہ پاکستان کی قومی اور رابطہ عامہ کی زبان ہے، جبکہ بھارت کی چھے ریاستوں کی دفتری زبان کا درجہ رکھتی ہے۔ آئین ہند کے مطابق اسے 22 دفتری شناخت زبانوں میں شامل کیا جاچکا ہے۔ 2001ء کی مردم شماری کے مطابق اردو کو بطور مادری زبان بھارت میں 5.01% فیصد لوگ بولتے ہیں اور اس لحاظ سے یہ بھارت کی چھٹی بڑی زبان ہے جبکہ پاکستان میں اسے بطور مادری زبان 7.59% فیصد لوگ استعمال کرتے ہیں، یہ پاکستان کی پانچویں بڑی زبان ہے۔ اردو تاریخی طور پر ہندوستان کی مسلم آبادی سے جڑی ہے۔[حوالہ درکار] بعض ذخیرہ الفاظ کے علاوہ یہ زبان معیاری ہندی سے قابل فہم ہے جو اس خطے کی ہندوؤں سے منسوب ہے۔[حوالہ درکار] زبانِ اردو کو پہچان و ترقی اس وقت ملی جب برطانوی دور میں انگریز حکمرانوں نے اسے فارسی کی بجائے انگریزی کے ساتھ شمالی ہندوستان کے علاقوں اور جموں و کشمیر میں اسے سنہ 1846ء اور پنجاب میں سنہ 1849ء میں بطور دفتری زبان نافذ کیا۔ اس کے علاوہ خلیجی، یورپی، ایشیائی اور امریکی علاقوں میں اردو بولنے والوں کی ایک بڑی تعداد آباد ہے جو بنیادی طور پر جنوبی ایشیاء سے کوچ کرنے والے اہلِ اردو ہیں۔ 1999ء کے اعداد وشمار کے مطابق اردو زبان کے مجموعی متکلمین کی تعداد دس کروڑ ساٹھ لاکھ کے لگ بھگ تھی۔ اس لحاظ سے یہ دنیا کی نویں بڑی زبان ہے۔'),
+    ],
+    # fmt: on
+)
+def test_sentencizer_across_scripts(lang, text):
+    nlp = spacy.blank(lang)
+    sentencizer = Sentencizer()
+    nlp.add_pipe(sentencizer)
+    doc = nlp(text)
+    assert len(list(doc.sents)) > 1
@@ -1,11 +1,16 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import itertools
+
 import pytest
-from spacy.vocab import Vocab
-from spacy.language import Language
-from spacy.tokens import Doc
+from spacy.compat import is_python2
 from spacy.gold import GoldParse
+from spacy.language import Language
+from spacy.tokens import Doc, Span
+from spacy.vocab import Vocab

 from .util import add_vecs_to_vocab, assert_docs_equal


 @pytest.fixture
@@ -58,3 +63,74 @@ def test_language_evaluate(nlp):
     # Evaluate badly
     with pytest.raises(Exception):
         nlp.evaluate([text, gold])
+
+
+def vector_modification_pipe(doc):
+    doc.vector += 1
+    return doc
+
+
+def userdata_pipe(doc):
+    doc.user_data["foo"] = "bar"
+    return doc
+
+
+def ner_pipe(doc):
+    span = Span(doc, 0, 1, label="FIRST")
+    doc.ents += (span,)
+    return doc
+
+
+@pytest.fixture
+def sample_vectors():
+    return [
+        ("spacy", [-0.1, -0.2, -0.3]),
+        ("world", [-0.2, -0.3, -0.4]),
+        ("pipe", [0.7, 0.8, 0.9]),
+    ]
+
+
+@pytest.fixture
+def nlp2(nlp, sample_vectors):
+    add_vecs_to_vocab(nlp.vocab, sample_vectors)
+    nlp.add_pipe(vector_modification_pipe)
+    nlp.add_pipe(ner_pipe)
+    nlp.add_pipe(userdata_pipe)
+    return nlp
+
+
+@pytest.fixture
+def texts():
+    data = [
+        "Hello world.",
+        "This is spacy.",
+        "You can use multiprocessing with pipe method.",
+        "Please try!",
+    ]
+    return data
+
+
+@pytest.mark.parametrize("n_process", [1, 2])
+def test_language_pipe(nlp2, n_process, texts):
+    texts = texts * 10
+    expecteds = [nlp2(text) for text in texts]
+    docs = nlp2.pipe(texts, n_process=n_process, batch_size=2)
+
+    for doc, expected_doc in zip(docs, expecteds):
+        assert_docs_equal(doc, expected_doc)
+
+
+@pytest.mark.skipif(
+    is_python2, reason="python2 seems to be unable to handle iterator properly"
+)
+@pytest.mark.parametrize("n_process", [1, 2])
+def test_language_pipe_stream(nlp2, n_process, texts):
+    # check if nlp.pipe can handle infinite length iterator properly.
+    stream_texts = itertools.cycle(texts)
+    texts0, texts1 = itertools.tee(stream_texts)
+    expecteds = (nlp2(text) for text in texts0)
+    docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2)
+
+    n_fetch = 20
+    for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch):
+        assert_docs_equal(doc, expected_doc)
spacy/tests/test_register_architecture.py | 19 (new file)
@@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy import register_architecture
+from spacy import get_architecture
+from thinc.v2v import Affine
+
+
+@register_architecture("my_test_function")
+def create_model(nr_in, nr_out):
+    return Affine(nr_in, nr_out)
+
+
+def test_get_architecture():
+    arch = get_architecture("my_test_function")
+    assert arch is create_model
+    with pytest.raises(KeyError):
+        get_architecture("not_an_existing_key")
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals, print_function

 import os
-import pkg_resources
 import importlib
 import re
 from pathlib import Path
@@ -28,15 +27,21 @@ except ImportError:

 from .symbols import ORTH
 from .compat import cupy, CudaStream, path2str, basestring_, unicode_
-from .compat import import_file
+from .compat import import_file, importlib_metadata
 from .errors import Errors, Warnings, deprecation_warning


 LANGUAGES = {}
+ARCHITECTURES = {}
 _data_path = Path(__file__).parent / "data"
 _PRINT_ENV = False


+# NB: Only ever call this once! If called more than once within the
+# function, test_issue1506 hangs and it's not 100% clear why.
+AVAILABLE_ENTRY_POINTS = importlib_metadata.entry_points()
+
+
 class ENTRY_POINTS(object):
     """Available entry points to register extensions."""

@@ -44,6 +49,7 @@ class ENTRY_POINTS(object):
     languages = "spacy_languages"
     displacy_colors = "spacy_displacy_colors"
     lookups = "spacy_lookups"
+    architectures = "spacy_architectures"


 def set_env_log(value):
@@ -115,6 +121,44 @@ def set_data_path(path):
     _data_path = ensure_path(path)


+def register_architecture(name, arch=None):
+    """Decorator to register an architecture. An architecture is a function
+    that returns a Thinc Model object.
+
+    name (unicode): The name of the architecture to register.
+    arch (Model): Optional architecture if function is called directly and
+        not used as a decorator.
+    RETURNS (callable): Function to register architecture.
+    """
+    global ARCHITECTURES
+    if arch is not None:
+        ARCHITECTURES[name] = arch
+        return arch
+
+    def do_registration(arch):
+        ARCHITECTURES[name] = arch
+        return arch
+
+    return do_registration
+
+
+def get_architecture(name):
+    """Get a model architecture function by name. Raises a KeyError if the
+    architecture is not found.
+
+    name (unicode): The name of the architecture.
+    RETURNS (Model): The architecture.
+    """
+    # Check if an entry point is exposed for the architecture code
+    entry_point = get_entry_point(ENTRY_POINTS.architectures, name)
+    if entry_point is not None:
+        ARCHITECTURES[name] = entry_point
+    if name not in ARCHITECTURES:
+        names = ", ".join(sorted(ARCHITECTURES.keys()))
+        raise KeyError(Errors.E174.format(name=name, names=names))
+    return ARCHITECTURES[name]
+
+
 def ensure_path(path):
     """Ensure string is converted to a Path.

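Because `get_architecture` also consults the `spacy_architectures` entry point group, a third-party package could expose an architecture without touching spaCy itself. A hypothetical `setup.py` sketch, where the package, module and function names are invented for illustration:

from setuptools import setup

setup(
    name="my_spacy_archs",
    py_modules=["my_spacy_archs"],
    entry_points={
        "spacy_architectures": [
            # spacy.get_architecture("custom_tok2vec") would load this function
            "custom_tok2vec = my_spacy_archs:build_custom_tok2vec",
        ]
    },
)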
@@ -253,6 +297,8 @@ def is_package(name):
     name (unicode): Name of package.
     RETURNS (bool): True if installed package, False if not.
     """
+    import pkg_resources
+
     name = name.lower()  # compare package name against lowercase name
     packages = pkg_resources.working_set.by_key.keys()
     for package in packages:
@@ -282,7 +328,7 @@ def get_entry_points(key):
     RETURNS (dict): Entry points, keyed by name.
     """
     result = {}
-    for entry_point in pkg_resources.iter_entry_points(key):
+    for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []):
         result[entry_point.name] = entry_point.load()
     return result

@@ -296,7 +342,7 @@ def get_entry_point(key, value, default=None):
     default: Optional default value to return.
     RETURNS: The loaded entry point or None.
     """
-    for entry_point in pkg_resources.iter_entry_points(key):
+    for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []):
         if entry_point.name == value:
             return entry_point.load()
     return default
@@ -337,7 +337,7 @@ cdef class Vectors:
             scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:]

             if sort:
-                sorted_index = xp.arange(scores.shape[0])[:,None],xp.argsort(scores, axis=1)[:,::-1]
+                sorted_index = xp.arange(scores.shape[0])[:,None],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
                 scores[i:i+batch_size] = scores[sorted_index]
                 best_rows[i:i+batch_size] = best_rows[sorted_index]

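The fix makes the within-batch argsort operate on the scores of the current batch rather than the full array. A usage sketch, assuming the v2.x `Vectors.most_similar` signature and its `(keys, best_rows, scores)` return value; the model name is a placeholder:

import numpy as np
import spacy

nlp = spacy.load("en_core_web_md")  # any model with word vectors
queries = np.vstack([nlp.vocab["dog"].vector, nlp.vocab["banana"].vector])
keys, best_rows, scores = nlp.vocab.vectors.most_similar(queries, n=5, sort=True)
print([nlp.vocab.strings[key] for key in keys[0]])  # neighbours of "dog", best first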
@@ -8,10 +8,10 @@
             "en_core_web_md",
             "en_core_web_lg",
             "en_vectors_web_lg",
-            "en_pytt_bertbaseuncased_lg",
-            "en_pytt_robertabase_lg",
-            "en_pytt_distilbertbaseuncased_lg",
-            "en_pytt_xlnetbasecased_lg"
+            "en_trf_bertbaseuncased_lg",
+            "en_trf_robertabase_lg",
+            "en_trf_distilbertbaseuncased_lg",
+            "en_trf_xlnetbasecased_lg"
         ],
         "example": "This is a sentence.",
         "has_examples": true
@@ -19,7 +19,7 @@
     {
         "code": "de",
         "name": "German",
-        "models": ["de_core_news_sm", "de_core_news_md", "de_pytt_bertbasecased_lg"],
+        "models": ["de_core_news_sm", "de_core_news_md", "de_trf_bertbasecased_lg"],
         "example": "Dies ist ein Satz.",
         "has_examples": true
     },
@@ -1675,21 +1675,21 @@
         }
     },
     {
-        "id": "spacy-pytorch-transformers",
-        "title": "spacy-pytorch-transformers",
+        "id": "spacy-transformers",
+        "title": "spacy-transformers",
         "slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2",
-        "description": "This package provides spaCy model pipelines that wrap [Hugging Face's `pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
-        "github": "explosion/spacy-pytorch-transformers",
-        "url": "https://explosion.ai/blog/spacy-pytorch-transformers",
-        "pip": "spacy-pytorch-transformers",
+        "description": "This package provides spaCy model pipelines that wrap [Hugging Face's `transformers`](https://github.com/huggingface/transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
+        "github": "explosion/spacy-transformers",
+        "url": "https://explosion.ai/blog/spacy-transformers",
+        "pip": "spacy-transformers",
         "category": ["pipeline", "models", "research"],
         "code_example": [
             "import spacy",
             "",
-            "nlp = spacy.load(\"en_pytt_bertbaseuncased_lg\")",
+            "nlp = spacy.load(\"en_trf_bertbaseuncased_lg\")",
             "doc = nlp(\"Apple shares rose on the news. Apple pie is delicious.\")",
             "print(doc[0].similarity(doc[7]))",
-            "print(doc._.pytt_last_hidden_state.shape)"
+            "print(doc._.trf_last_hidden_state.shape)"
         ],
         "author": "Explosion",
         "author_links": {
@@ -23,6 +23,7 @@ const MODEL_META = {
   dep: 'Vocabulary, syntax',
   ent: 'Named entities',
   pytt: 'PyTorch Transformers',
+  trf: 'Transformers',
   vectors: 'Word vectors',
   web: 'written text (blogs, news, comments)',
   news: 'written text (news, media)',