Merge branch 'master' into spacy.io

Ines Montani 2019-10-08 15:39:48 +02:00
commit e8f284c741
25 changed files with 368 additions and 40 deletions

View File

@ -48,4 +48,6 @@ redirects = [
{from = "/api/sentencesegmenter", to="/api/sentencizer"},
{from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true},
{from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true},
# Renamed universe projects
{from = "/universe/project/spacy-pytorch-transformers", to = "/universe/project/spacy-transformers", force = true}
]

View File

@ -11,6 +11,7 @@ numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac<1.0.0,>=0.9.6
pathlib==1.0.1; python_version < "3.4"
importlib_metadata>=0.23; python_version < "3.8"
# Optional dependencies
jsonschema>=2.6.0,<3.1.0
# Development dependencies

View File

@ -39,6 +39,7 @@ setup_requires =
murmurhash>=0.28.0,<1.1.0
thinc>=7.1.1,<7.2.0
install_requires =
setuptools
numpy>=1.15.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
@ -50,6 +51,7 @@ install_requires =
wasabi>=0.2.0,<1.1.0
srsly>=0.1.0,<1.1.0
pathlib==1.0.1; python_version < "3.4"
importlib_metadata>=0.23; python_version < "3.8"
[options.extras_require]
lookups =

View File

@ -14,6 +14,7 @@ from .glossary import explain
from .about import __version__
from .errors import Errors, Warnings, deprecation_warning
from . import util
from .util import register_architecture, get_architecture
if sys.maxunicode == 65535:
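The re-export above makes the new architecture registry available at the top level. A minimal sketch of the intended usage, mirroring the registry test added later in this commit (the architecture name and function here are placeholders):

from spacy import register_architecture, get_architecture
from thinc.v2v import Affine

@register_architecture("my_affine")
def create_affine(nr_in, nr_out):
    # Any function that returns a Thinc Model can be registered under a name.
    return Affine(nr_in, nr_out)

assert get_architecture("my_affine") is create_affine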

View File

@ -953,16 +953,24 @@ class CharacterEmbed(Model):
return output, backprop_character_embed
def get_cossim_loss(yh, y):
def get_cossim_loss(yh, y, ignore_zeros=False):
xp = get_array_module(yh)
# Find the zero vectors
if ignore_zeros:
zero_indices = xp.abs(y).sum(axis=1) == 0
# Add a small constant to avoid 0 vectors
yh = yh + 1e-8
y = y + 1e-8
# https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
xp = get_array_module(yh)
norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
mul_norms = norm_yh * norm_y
cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
loss = xp.abs(cosine - 1).sum()
losses = xp.abs(cosine - 1)
if ignore_zeros:
# If the target was a zero vector, don't count it in the loss.
d_yh[zero_indices] = 0
losses[zero_indices] = 0
loss = losses.sum()
return loss, -d_yh
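A quick sanity check of the new ignore_zeros flag (not part of the diff; get_cossim_loss lives in spacy._ml, as the pipes.pyx import further down shows, and plain numpy arrays work because of get_array_module):

import numpy
from spacy._ml import get_cossim_loss

yh = numpy.asarray([[1.0, 0.0], [0.5, 0.5]], dtype="float32")
y = numpy.asarray([[0.0, 1.0], [0.0, 0.0]], dtype="float32")  # second target is a zero vector
loss, d_yh = get_cossim_loss(yh, y, ignore_zeros=True)
# The zero-vector row contributes neither to the loss nor to the gradient.
assert d_yh[1].sum() == 0.0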

View File

@ -7,7 +7,7 @@ from ...gold import docs_to_json
from ...util import get_lang_class, minibatch
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
if lang is None:
raise ValueError("No --lang specified, but tokenization required")
json_docs = []

View File

@ -6,7 +6,6 @@ import requests
import os
import subprocess
import sys
import pkg_resources
from wasabi import Printer
from .link import link
@ -87,6 +86,8 @@ def download(model, direct=False, *pip_args):
def require_package(name):
try:
import pkg_resources
pkg_resources.working_set.require(name)
return True
except: # noqa: E722

View File

@ -35,6 +35,7 @@ from .train import _load_pretrained_tok2vec
output_dir=("Directory to write models to on each epoch", "positional", None, str),
width=("Width of CNN layers", "option", "cw", int),
depth=("Depth of CNN layers", "option", "cd", int),
bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
embed_rows=("Number of embedding rows", "option", "er", int),
loss_func=(
"Loss function to use for the objective. Either 'L2' or 'cosine'",
@ -80,6 +81,7 @@ def pretrain(
output_dir,
width=96,
depth=4,
bilstm_depth=2,
embed_rows=2000,
loss_func="cosine",
use_vectors=False,
@ -116,6 +118,10 @@ def pretrain(
util.fix_random_seed(seed)
has_gpu = prefer_gpu()
if has_gpu:
import torch
torch.set_default_tensor_type("torch.cuda.FloatTensor")
msg.info("Using GPU" if has_gpu else "Not using GPU")
output_dir = Path(output_dir)
@ -151,7 +157,7 @@ def pretrain(
embed_rows,
conv_depth=depth,
pretrained_vectors=pretrained_vectors,
bilstm_depth=0, # Requires PyTorch. Experimental.
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
cnn_maxout_pieces=3, # You can try setting this higher
subword_features=True, # Set to False for Chinese etc
),

View File

@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals, print_function
import pkg_resources
from pathlib import Path
import sys
import requests
@ -109,6 +108,8 @@ def get_model_links(compat):
def get_model_pkgs(compat, all_models):
import pkg_resources
pkgs = {}
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
package = pkg_name.replace("-", "_")

View File

@ -35,6 +35,11 @@ try:
except ImportError:
cupy = None
try: # Python 3.8
import importlib.metadata as importlib_metadata
except ImportError:
import importlib_metadata # noqa: F401
try:
from thinc.neural.optimizers import Optimizer # noqa: F401
except ImportError:
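A hedged sketch of how the shim above is consumed (util.py further down does exactly this): both the Python 3.8 stdlib module and the backport return the same mapping of entry point group to entry points.

from spacy.compat import importlib_metadata

# entry_points() returns a mapping of group name -> list of EntryPoint objects.
entry_points = importlib_metadata.entry_points()
for ep in entry_points.get("spacy_languages", []):
    lang_class = ep.load()  # imports and returns the registered object
    print(ep.name, lang_class)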

View File

@ -95,6 +95,7 @@ class Warnings(object):
"you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
"If this is surprising, make sure you have the spacy-lookups-data "
"package installed.")
W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. 'n_process' will be set to 1.")
@add_codes
@ -495,6 +496,8 @@ class Errors(object):
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
"Lookups containing the lemmatization tables. See the docs for "
"details: https://spacy.io/api/lemmatizer#init")
E174 = ("Architecture '{name}' not found in registry. Available "
"names: {names}")
@add_codes

View File

@ -184,7 +184,7 @@ _russian_lower = r"ёа-я"
_russian_upper = r"ЁА-Я"
_russian = r"ёа-яЁА-Я"
_sinhala = r"\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6"
_sinhala = r"\u0D80-\u0DFF"
_tatar_lower = r"әөүҗңһ"
_tatar_upper = r"ӘӨҮҖҢҺ"
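A quick check of what the widened range adds (not part of the diff): sign characters such as al-lakuna, U+0DCA, fall outside the old explicit sub-ranges but inside the full Sinhala block.

import re

old_sinhala = r"\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6"
new_sinhala = r"\u0D80-\u0DFF"
al_lakuna = "\u0DCA"  # SINHALA SIGN AL-LAKUNA
assert not re.match("[" + old_sinhala + "]", al_lakuna)
assert re.match("[" + new_sinhala + "]", al_lakuna)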

View File

@ -1,8 +1,11 @@
# coding: utf8
from __future__ import absolute_import, unicode_literals
import atexit
import random
import itertools
from warnings import warn
from spacy.util import minibatch
import weakref
import functools
from collections import OrderedDict
@ -10,6 +13,8 @@ from contextlib import contextmanager
from copy import copy, deepcopy
from thinc.neural import Model
import srsly
import multiprocessing as mp
from itertools import chain, cycle
from .tokenizer import Tokenizer
from .vocab import Vocab
@ -21,7 +26,7 @@ from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .pipeline import EntityRuler
from .pipeline import Morphologizer
from .compat import izip, basestring_
from .compat import izip, basestring_, is_python2
from .gold import GoldParse
from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer
@ -30,8 +35,9 @@ from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .tokens import Doc
from .lang.lex_attrs import LEX_ATTRS, is_stop
from .errors import Errors, Warnings, deprecation_warning
from .errors import Errors, Warnings, deprecation_warning, user_warning
from . import util
from . import about
@ -733,6 +739,7 @@ class Language(object):
disable=[],
cleanup=False,
component_cfg=None,
n_process=1,
):
"""Process texts as a stream, and yield `Doc` objects in order.
@ -746,12 +753,20 @@ class Language(object):
use. Experimental.
component_cfg (dict): An optional dictionary with extra keyword
arguments for specific components.
n_process (int): Number of processes to use when processing the texts in parallel; only supported in Python 3. If -1, `multiprocessing.cpu_count()` is used.
YIELDS (Doc): Documents in the order of the original text.
DOCS: https://spacy.io/api/language#pipe
"""
# raw_texts will be used later to stop the iterator.
texts, raw_texts = itertools.tee(texts)
if is_python2 and n_process != 1:
user_warning(Warnings.W023)
n_process = 1
if n_threads != -1:
deprecation_warning(Warnings.W016)
if n_process == -1:
n_process = mp.cpu_count()
if as_tuples:
text_context1, text_context2 = itertools.tee(texts)
texts = (tc[0] for tc in text_context1)
@ -765,9 +780,12 @@ class Language(object):
for doc, context in izip(docs, contexts):
yield (doc, context)
return
docs = (self.make_doc(text) for text in texts)
if component_cfg is None:
component_cfg = {}
pipes = (
[]
) # contains functools.partial objects so that multiprocessing workers can be created easily.
for name, proc in self.pipeline:
if name in disable:
continue
@ -775,10 +793,20 @@ class Language(object):
# Allow component_cfg to overwrite the top-level kwargs.
kwargs.setdefault("batch_size", batch_size)
if hasattr(proc, "pipe"):
docs = proc.pipe(docs, **kwargs)
f = functools.partial(proc.pipe, **kwargs)
else:
# Apply the function, but yield the doc
docs = _pipe(proc, docs, kwargs)
f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
pipes.append(f)
if n_process != 1:
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
else:
# if n_process == 1, no processes are forked.
docs = (self.make_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs)
# Track weakrefs of "recent" documents, so that we can see when they
# expire from memory. When they do, we know we don't need old strings.
# This way, we avoid maintaining an unbounded growth in string entries
@ -809,6 +837,46 @@ class Language(object):
self.tokenizer._reset_cache(keys)
nr_seen = 0
def _multiprocessing_pipe(self, texts, pipes, n_process, batch_size):
# raw_texts is used later to stop iteration.
texts, raw_texts = itertools.tee(texts)
# for sending texts to the workers
texts_q = [mp.Queue() for _ in range(n_process)]
# for receiving byte-encoded docs from the workers
bytedocs_recv_ch, bytedocs_send_ch = zip(
*[mp.Pipe(False) for _ in range(n_process)]
)
batch_texts = minibatch(texts, batch_size)
# Sender sends texts to the workers.
# This is necessary to handle texts of potentially infinite length properly
# (in that case, not all the data can be sent to the workers at once).
sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
# send twice up front so that the worker processes are kept busy
sender.send()
sender.send()
procs = [
mp.Process(target=_apply_pipes, args=(self.make_doc, pipes, rch, sch))
for rch, sch in zip(texts_q, bytedocs_send_ch)
]
for proc in procs:
proc.start()
# Cycle through the channels so that the original order of the docs is preserved.
# Each received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
try:
for i, (_, doc) in enumerate(zip(raw_texts, docs), 1):
yield doc
if i % batch_size == 0:
# tell `sender` that one batch was consumed.
sender.step()
finally:
for proc in procs:
proc.terminate()
def to_disk(self, path, exclude=tuple(), disable=None):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
@ -987,12 +1055,55 @@ class DisabledPipes(list):
self[:] = []
def _pipe(func, docs, kwargs):
def _pipe(docs, proc, kwargs):
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)
for arg in ["n_threads", "batch_size"]:
if arg in kwargs:
kwargs.pop(arg)
for doc in docs:
doc = func(doc, **kwargs)
doc = proc(doc, **kwargs)
yield doc
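The reordered signature above is what makes the functools.partial construction in pipe() work: proc and kwargs are bound by keyword, leaving the docs iterable as the only positional argument. A toy illustration (not from the diff; plain strings stand in for Doc objects):

import functools
from spacy.language import _pipe

def upper_proc(doc, **kwargs):
    return doc.upper()

f = functools.partial(_pipe, proc=upper_proc, kwargs={"batch_size": 8})
print(list(f(["a", "b"])))  # -> ['A', 'B']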
def _apply_pipes(make_doc, pipes, receiver, sender):
"""Worker for Language.pipe
Args:
receiver (multiprocessing.Queue): Queue from which batches of texts are received.
sender (multiprocessing.Connection): Pipe connection used to send back byte-encoded docs. Usually created by `multiprocessing.Pipe()`
"""
while True:
texts = receiver.get()
docs = (make_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs)
# Connection does not accept unpicklable objects, so send a list of bytes.
sender.send([doc.to_bytes() for doc in docs])
class _Sender:
"""Util for sending data to multiprocessing workers in Language.pipe"""
def __init__(self, data, queues, chunk_size):
self.data = iter(data)
self.queues = iter(cycle(queues))
self.chunk_size = chunk_size
self.count = 0
def send(self):
"""Send chunk_size items from self.data to channels."""
for item, q in itertools.islice(
zip(self.data, cycle(self.queues)), self.chunk_size
):
# cycle through the channels so that the texts are distributed evenly
q.put(item)
def step(self):
"""Tell sender that comsumed one item.
Data is sent to the workers after every chunk_size calls."""
self.count += 1
if self.count >= self.chunk_size:
self.count = 0
self.send()
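Taken together, the changes in this file add an n_process argument to Language.pipe. A minimal usage sketch (not from the diff; a blank pipeline with the sentencizer, Python 3 only):

import spacy

if __name__ == "__main__":  # guard needed when workers are spawned rather than forked
    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    texts = ["This is a sentence. This is another one.", "Short text."] * 50
    # n_process=2 forks two workers; n_process=-1 uses multiprocessing.cpu_count().
    docs = nlp.pipe(texts, n_process=2, batch_size=16)
    print(sum(len(list(doc.sents)) for doc in docs))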

View File

@ -225,7 +225,7 @@ cdef class PhraseMatcher:
for i in range(c_matches.size()):
matches.append((c_matches[i].match_id, c_matches[i].start, c_matches[i].end))
for i, (ent_id, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
on_match = self._callbacks.get(self.vocab.strings[ent_id])
if on_match is not None:
on_match(self, doc, i, matches)
return matches

View File

@ -29,7 +29,7 @@ from .._ml import Tok2Vec, build_tagger_model, cosine, get_cossim_loss
from .._ml import build_text_classifier, build_simple_cnn_text_classifier
from .._ml import build_bow_text_classifier, build_nel_encoder
from .._ml import link_vectors_to_models, zero_init, flatten
from .._ml import masked_language_model, create_default_optimizer
from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
from ..errors import Errors, TempErrors, user_warning, Warnings
from .. import util
@ -880,8 +880,7 @@ class ClozeMultitask(Pipe):
# and look them up all at once. This prevents data copying.
ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = vectors[ids]
gradient = (prediction - target) / prediction.shape[0]
loss = (gradient**2).sum()
loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
return float(loss), gradient
def update(self, docs, golds, drop=0., sgd=None, losses=None):

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
import pytest
import re
from mock import Mock
from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Doc, Token
@ -418,3 +419,13 @@ def test_matcher_valid_callback(en_vocab):
with pytest.raises(ValueError):
matcher.add("TEST", [], [{"TEXT": "test"}])
matcher(Doc(en_vocab, words=["test"]))
def test_matcher_callback(en_vocab):
mock = Mock()
matcher = Matcher(en_vocab)
pattern = [{"ORTH": "test"}]
matcher.add("Rule", mock, pattern)
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
matches = matcher(doc)
mock.assert_called_once_with(matcher, doc, 0, matches)

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
import pytest
from mock import Mock
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from ..util import get_doc
@ -215,3 +216,13 @@ def test_attr_pipeline_checks(en_vocab):
matcher.add("TEST3", None, doc3)
matcher = PhraseMatcher(en_vocab, attr="TEXT")
matcher.add("TEST3", None, doc3)
def test_phrase_matcher_callback(en_vocab):
mock = Mock()
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
pattern = Doc(en_vocab, words=["Google", "Now"])
matcher = PhraseMatcher(en_vocab)
matcher.add("COMPANY", mock, pattern)
matches = matcher(doc)
mock.assert_called_once_with(matcher, doc, 0, matches)

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
import pytest
import spacy
from spacy.pipeline import Sentencizer
from spacy.tokens import Doc
@ -85,3 +86,26 @@ def test_sentencizer_serialize_bytes(en_vocab):
bytes_data = sentencizer.to_bytes()
new_sentencizer = Sentencizer().from_bytes(bytes_data)
assert new_sentencizer.punct_chars == set(punct_chars)
@pytest.mark.parametrize(
# fmt: off
"lang,text",
[
('bn', 'বাংলা ভাষা (বাঙলা, বাঙ্গলা, তথা বাঙ্গালা নামগুলোতেও পরিচিত) একটি ইন্দো-আর্য ভাষা, যা দক্ষিণ এশিয়ার বাঙালি জাতির প্রধান কথ্য ও লেখ্য ভাষা। মাতৃভাষীর সংখ্যায় বাংলা ইন্দো-ইউরোপীয় ভাষা পরিবারের চতুর্থ ও বিশ্বের ষষ্ঠ বৃহত্তম ভাষা।[৫] মোট ব্যবহারকারীর সংখ্যা অনুসারে বাংলা বিশ্বের সপ্তম বৃহত্তম ভাষা। বাংলা সার্বভৌম ভাষাভিত্তিক জাতিরাষ্ট্র বাংলাদেশের একমাত্র রাষ্ট্রভাষা তথা সরকারি ভাষা[৬] এবং ভারতের পশ্চিমবঙ্গ, ত্রিপুরা, আসামের বরাক উপত্যকার সরকারি ভাষা। বঙ্গোপসাগরে অবস্থিত আন্দামান দ্বীপপুঞ্জের প্রধান কথ্য ভাষা বাংলা। এছাড়া ভারতের ঝাড়খণ্ড, বিহার, মেঘালয়, মিজোরাম, উড়িষ্যা রাজ্যগুলোতে উল্লেখযোগ্য পরিমাণে বাংলাভাষী জনগণ রয়েছে। ভারতে হিন্দির পরেই সর্বাধিক প্রচলিত ভাষা বাংলা।[][৮] এছাড়াও মধ্য প্রাচ্য, আমেরিকা ও ইউরোপে উল্লেখযোগ্য পরিমাণে বাংলাভাষী অভিবাসী রয়েছে।[৯] সারা বিশ্বে সব মিলিয়ে ২৬ কোটির অধিক লোক দৈনন্দিন জীবনে বাংলা ব্যবহার করে।[২] বাংলাদেশের জাতীয় সঙ্গীত এবং ভারতের জাতীয় সঙ্গীত ও স্তোত্র বাংলাতে রচিত।'),
('de', 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache. Ihr Sprachraum umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig. Außerdem ist sie eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z. B. in Rumänien und Südafrika, sowie Nationalsprache im afrikanischen Namibia.'),
('hi', 'हिन्दी विश्व की एक प्रमुख भाषा है एवं भारत की राजभाषा है। केन्द्रीय स्तर पर भारत में दूसरी आधिकारिक भाषा अंग्रेजी है। यह हिंदुस्तानी भाषा की एक मानकीकृत रूप है जिसमें संस्कृत के तत्सम तथा तद्भव शब्दों का प्रयोग अधिक है और अरबी-फ़ारसी शब्द कम हैं। हिंदी संवैधानिक रूप से भारत की राजभाषा और भारत की सबसे अधिक बोली और समझी जाने वाली भाषा है। हालाँकि, हिन्दी भारत की राष्ट्रभाषा नहीं है,[3] क्योंकि भारत के संविधान में कोई भी भाषा को ऐसा दर्जा नहीं दिया गया था।[4][5] चीनी के बाद यह विश्व में सबसे अधिक बोली जाने वाली भाषा भी है। विश्व आर्थिक मंच की गणना के अनुसार यह विश्व की दस शक्तिशाली भाषाओं में से एक है।[6]'),
('kn', 'ದ್ರಾವಿಡ ಭಾಷೆಗಳಲ್ಲಿ ಪ್ರಾಮುಖ್ಯವುಳ್ಳ ಭಾಷೆಯೂ ಭಾರತದ ಪುರಾತನವಾದ ಭಾಷೆಗಳಲ್ಲಿ ಒಂದೂ ಆಗಿರುವ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಅದರ ವಿವಿಧ ರೂಪಗಳಲ್ಲಿ ಸುಮಾರು ೪೫ ದಶಲಕ್ಷ ಜನರು ಆಡು ನುಡಿಯಾಗಿ ಬಳಸುತ್ತಲಿದ್ದಾರೆ. ಕನ್ನಡ ಕರ್ನಾಟಕ ರಾಜ್ಯದ ಆಡಳಿತ ಭಾಷೆ.[೧೧] ಜಗತ್ತಿನಲ್ಲಿ ಅತ್ಯಂತ ಹೆಚ್ಚು ಮಂದಿ ಮಾತನಾಡುವ ಭಾಷೆಯೆಂಬ ನೆಲೆಯಲ್ಲಿ ಇಪ್ಪತೊಂಬತ್ತನೆಯ ಸ್ಥಾನ ಕನ್ನಡಕ್ಕಿದೆ. ೨೦೧೧ರ ಜನಗಣತಿಯ ಪ್ರಕಾರ ಜಗತ್ತಿನಲ್ಲಿ ೬.೪ ಕೋಟಿ ಜನಗಳು ಕನ್ನಡ ಮಾತನಾಡುತ್ತಾರೆ ಎಂದು ತಿಳಿದುಬಂದಿದೆ. ಇವರಲ್ಲಿ ೫.೫ ಕೋಟಿ ಜನಗಳ ಮಾತೃಭಾಷೆ ಕನ್ನಡವಾಗಿದೆ. ಬ್ರಾಹ್ಮಿ ಲಿಪಿಯಿಂದ ರೂಪುಗೊಂಡ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಉಪಯೋಗಿಸಿ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಬರೆಯಲಾಗುತ್ತದೆ. ಕನ್ನಡ ಬರಹದ ಮಾದರಿಗಳಿಗೆ ಸಾವಿರದ ಐನೂರು ವರುಷಗಳ ಚರಿತ್ರೆಯಿದೆ. ಕ್ರಿ.ಶ. ಆರನೆಯ ಶತಮಾನದ ಪಶ್ಚಿಮ ಗಂಗ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ [೧೨] ಮತ್ತು ಒಂಬತ್ತನೆಯ ಶತಮಾನದ ರಾಷ್ಟ್ರಕೂಟ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ ಹಳಗನ್ನಡ ಸಾಹಿತ್ಯ ಅತ್ಯಂತ ಹೆಚ್ಚಿನ ರಾಜಾಶ್ರಯ ಪಡೆಯಿತು.[೧೩][೧೪] ಅದಲ್ಲದೆ ಸಾವಿರ ವರುಷಗಳ ಸಾಹಿತ್ಯ ಪರಂಪರೆ ಕನ್ನಡಕ್ಕಿದೆ.[೧೫]ವಿನೋಬಾ ಭಾವೆ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಲಿಪಿಗಳ ರಾಣಿಯೆಂದು ಹೊಗಳಿದ್ದಾರೆ.[ಸೂಕ್ತ ಉಲ್ಲೇಖನ ಬೇಕು]'),
('si', 'ශ්‍රී ලංකාවේ ප්‍රධාන ජාතිය වන සිංහල ජනයාගේ මව් බස සිංහල වෙයි. අද වන විට මිලියන 20 කට අධික සිංහල සහ මිලියන 3කට අධික සිංහල නොවන ජනගහනයක් සිංහල භාෂාව භාවිත කරති. සිංහල‍ ඉන්දු-යුරෝපීය භාෂාවල උප ගණයක් වන ඉන්දු-ආර්ය භාෂා ගණයට අයිති වන අතර මාල දිවයින භාවිත කරන දිවෙහි භාෂාව සිංහලයෙන් පැවත එන්නකි. සිංහල ශ්‍රී ලංකාවේ නිල භාෂාවයි .'),
('ta', 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும். தமிழ் திராவிட மொழிக் குடும்பத்தின் முதன்மையான மொழிகளில் ஒன்றும் செம்மொழியும் ஆகும். இந்தியா, இலங்கை, மலேசியா, சிங்கப்பூர் ஆகிய நாடுகளில் அதிக அளவிலும், ஐக்கிய அரபு அமீரகம், தென்னாப்பிரிக்கா, மொரிசியசு, பிஜி, ரீயூனியன், டிரினிடாட் போன்ற நாடுகளில் சிறிய அளவிலும் தமிழ் பேசப்படுகிறது. 1997ஆம் ஆண்டுப் புள்ளி விவரப்படி உலகம் முழுவதிலும் 8 கோடி (80 மில்லியன்) மக்களால் பேசப்படும் தமிழ்[13], ஒரு மொழியைத் தாய்மொழியாகக் கொண்டு பேசும் மக்களின் எண்ணிக்கை அடிப்படையில் பதினெட்டாவது இடத்தில் உள்ளது.[14] இணையத்தில் அதிகம் பயன்படுத்தப்படும் இந்திய மொழிகளில் தமிழ் முதன்மையாக உள்ளதாக 2017 ஆவது ஆண்டில் நடைபெற்ற கூகுள் கணக்கெடுப்பில் தெரிய வந்தது.[15]'),
('te', 'ఆంధ్ర ప్రదేశ్, తెలంగాణ రాష్ట్రాల అధికార భాష తెలుగు. భారత దేశంలో తెలుగు మాతృభాషగా మాట్లాడే 8.7 కోట్ల (2001) జనాభాతో [1] ప్రాంతీయ భాషలలో మొదటి స్థానంలో ఉంది. ప్రపంచంలోని ప్రజలు అత్యధికముగా మాట్లాడే భాషలలో 15 స్థానములోనూ, భారత దేశములో హిందీ, తర్వాత స్థానములోనూ నిలుస్తుంది. పాతవైన ప్రపంచ భాష గణాంకాల (ఎథ్నోలాగ్) ప్రకారం ప్రపంచవ్యాప్తంగా 7.4 కోట్లు మందికి మాతృభాషగా ఉంది.[2] మొదటి భాషగా మాట్లాడతారు. అతి ప్రాచీన దేశ భాషలలో సంస్కృతము తమిళముతో బాటు తెలుగు భాషను 2008 అక్టోబరు 31న భారత ప్రభుత్వము గుర్తించింది.'),
('ur', 'اُردُو لشکری زبان[8] (یا جدید معیاری اردو) برصغیر کی معیاری زبانوں میں سے ایک ہے۔ یہ پاکستان کی قومی اور رابطہ عامہ کی زبان ہے، جبکہ بھارت کی چھے ریاستوں کی دفتری زبان کا درجہ رکھتی ہے۔ آئین ہند کے مطابق اسے 22 دفتری شناخت زبانوں میں شامل کیا جاچکا ہے۔ 2001ء کی مردم شماری کے مطابق اردو کو بطور مادری زبان بھارت میں 5.01% فیصد لوگ بولتے ہیں اور اس لحاظ سے یہ بھارت کی چھٹی بڑی زبان ہے جبکہ پاکستان میں اسے بطور مادری زبان 7.59% فیصد لوگ استعمال کرتے ہیں، یہ پاکستان کی پانچویں بڑی زبان ہے۔ اردو تاریخی طور پر ہندوستان کی مسلم آبادی سے جڑی ہے۔[حوالہ درکار] بعض ذخیرہ الفاظ کے علاوہ یہ زبان معیاری ہندی سے قابل فہم ہے جو اس خطے کی ہندوؤں سے منسوب ہے۔[حوالہ درکار] زبانِ اردو کو پہچان و ترقی اس وقت ملی جب برطانوی دور میں انگریز حکمرانوں نے اسے فارسی کی بجائے انگریزی کے ساتھ شمالی ہندوستان کے علاقوں اور جموں و کشمیر میں اسے سنہ 1846ء اور پنجاب میں سنہ 1849ء میں بطور دفتری زبان نافذ کیا۔ اس کے علاوہ خلیجی، یورپی، ایشیائی اور امریکی علاقوں میں اردو بولنے والوں کی ایک بڑی تعداد آباد ہے جو بنیادی طور پر جنوبی ایشیاء سے کوچ کرنے والے اہلِ اردو ہیں۔ 1999ء کے اعداد وشمار کے مطابق اردو زبان کے مجموعی متکلمین کی تعداد دس کروڑ ساٹھ لاکھ کے لگ بھگ تھی۔ اس لحاظ سے یہ دنیا کی نویں بڑی زبان ہے۔'),
],
# fmt: on
)
def test_sentencizer_across_scripts(lang, text):
nlp = spacy.blank(lang)
sentencizer = Sentencizer()
nlp.add_pipe(sentencizer)
doc = nlp(text)
assert len(list(doc.sents)) > 1

View File

@ -1,11 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import pytest
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.tokens import Doc
from spacy.compat import is_python2
from spacy.gold import GoldParse
from spacy.language import Language
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from .util import add_vecs_to_vocab, assert_docs_equal
@pytest.fixture
@ -58,3 +63,74 @@ def test_language_evaluate(nlp):
# Evaluate badly
with pytest.raises(Exception):
nlp.evaluate([text, gold])
def vector_modification_pipe(doc):
doc.vector += 1
return doc
def userdata_pipe(doc):
doc.user_data["foo"] = "bar"
return doc
def ner_pipe(doc):
span = Span(doc, 0, 1, label="FIRST")
doc.ents += (span,)
return doc
@pytest.fixture
def sample_vectors():
return [
("spacy", [-0.1, -0.2, -0.3]),
("world", [-0.2, -0.3, -0.4]),
("pipe", [0.7, 0.8, 0.9]),
]
@pytest.fixture
def nlp2(nlp, sample_vectors):
add_vecs_to_vocab(nlp.vocab, sample_vectors)
nlp.add_pipe(vector_modification_pipe)
nlp.add_pipe(ner_pipe)
nlp.add_pipe(userdata_pipe)
return nlp
@pytest.fixture
def texts():
data = [
"Hello world.",
"This is spacy.",
"You can use multiprocessing with pipe method.",
"Please try!",
]
return data
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe(nlp2, n_process, texts):
texts = texts * 10
expecteds = [nlp2(text) for text in texts]
docs = nlp2.pipe(texts, n_process=n_process, batch_size=2)
for doc, expected_doc in zip(docs, expecteds):
assert_docs_equal(doc, expected_doc)
@pytest.mark.skipif(
is_python2, reason="python2 seems to be unable to handle iterator properly"
)
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_stream(nlp2, n_process, texts):
# Check that nlp.pipe handles infinite-length iterators properly.
stream_texts = itertools.cycle(texts)
texts0, texts1 = itertools.tee(stream_texts)
expecteds = (nlp2(text) for text in texts0)
docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2)
n_fetch = 20
for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch):
assert_docs_equal(doc, expected_doc)

View File

@ -0,0 +1,19 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy import register_architecture
from spacy import get_architecture
from thinc.v2v import Affine
@register_architecture("my_test_function")
def create_model(nr_in, nr_out):
return Affine(nr_in, nr_out)
def test_get_architecture():
arch = get_architecture("my_test_function")
assert arch is create_model
with pytest.raises(KeyError):
get_architecture("not_an_existing_key")

View File

@ -2,7 +2,6 @@
from __future__ import unicode_literals, print_function
import os
import pkg_resources
import importlib
import re
from pathlib import Path
@ -28,15 +27,21 @@ except ImportError:
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
from .compat import import_file
from .compat import import_file, importlib_metadata
from .errors import Errors, Warnings, deprecation_warning
LANGUAGES = {}
ARCHITECTURES = {}
_data_path = Path(__file__).parent / "data"
_PRINT_ENV = False
# NB: Only ever call this once! If called more than once within the
# function, test_issue1506 hangs and it's not 100% clear why.
AVAILABLE_ENTRY_POINTS = importlib_metadata.entry_points()
class ENTRY_POINTS(object):
"""Available entry points to register extensions."""
@ -44,6 +49,7 @@ class ENTRY_POINTS(object):
languages = "spacy_languages"
displacy_colors = "spacy_displacy_colors"
lookups = "spacy_lookups"
architectures = "spacy_architectures"
def set_env_log(value):
@ -115,6 +121,44 @@ def set_data_path(path):
_data_path = ensure_path(path)
def register_architecture(name, arch=None):
"""Decorator to register an architecture. An architecture is a function
that returns a Thinc Model object.
name (unicode): The name of the architecture to register.
arch (Model): Optional architecture if function is called directly and
not used as a decorator.
RETURNS (callable): Function to register architecture.
"""
global ARCHITECTURES
if arch is not None:
ARCHITECTURES[name] = arch
return arch
def do_registration(arch):
ARCHITECTURES[name] = arch
return arch
return do_registration
def get_architecture(name):
"""Get a model architecture function by name. Raises a KeyError if the
architecture is not found.
name (unicode): The name of the architecture.
RETURNS (Model): The architecture.
"""
# Check if an entry point is exposed for the architecture code
entry_point = get_entry_point(ENTRY_POINTS.architectures, name)
if entry_point is not None:
ARCHITECTURES[name] = entry_point
if name not in ARCHITECTURES:
names = ", ".join(sorted(ARCHITECTURES.keys()))
raise KeyError(Errors.E174.format(name=name, names=names))
return ARCHITECTURES[name]
def ensure_path(path):
"""Ensure string is converted to a Path.
@ -253,6 +297,8 @@ def is_package(name):
name (unicode): Name of package.
RETURNS (bool): True if installed package, False if not.
"""
import pkg_resources
name = name.lower() # compare package name against lowercase name
packages = pkg_resources.working_set.by_key.keys()
for package in packages:
@ -282,7 +328,7 @@ def get_entry_points(key):
RETURNS (dict): Entry points, keyed by name.
"""
result = {}
for entry_point in pkg_resources.iter_entry_points(key):
for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []):
result[entry_point.name] = entry_point.load()
return result
@ -296,7 +342,7 @@ def get_entry_point(key, value, default=None):
default: Optional default value to return.
RETURNS: The loaded entry point or None.
"""
for entry_point in pkg_resources.iter_entry_points(key):
for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []):
if entry_point.name == value:
return entry_point.load()
return default
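Because of the new spacy_architectures group, a third-party package can also expose an architecture as an entry point instead of calling register_architecture at import time. A hedged setup.py sketch (the package, module and function names are made up for illustration):

from setuptools import setup

setup(
    name="my-spacy-architectures",               # hypothetical package
    packages=["my_spacy_architectures"],
    entry_points={
        "spacy_architectures": [
            # get_architecture("custom_cnn") will lazily load this function.
            "custom_cnn = my_spacy_architectures.models:create_cnn",
        ]
    },
)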

View File

@ -337,7 +337,7 @@ cdef class Vectors:
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:]
if sort:
sorted_index = xp.arange(scores.shape[0])[:,None],xp.argsort(scores, axis=1)[:,::-1]
sorted_index = xp.arange(scores.shape[0])[:,None],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
scores[i:i+batch_size] = scores[sorted_index]
best_rows[i:i+batch_size] = best_rows[sorted_index]

View File

@ -8,10 +8,10 @@
"en_core_web_md",
"en_core_web_lg",
"en_vectors_web_lg",
"en_pytt_bertbaseuncased_lg",
"en_pytt_robertabase_lg",
"en_pytt_distilbertbaseuncased_lg",
"en_pytt_xlnetbasecased_lg"
"en_trf_bertbaseuncased_lg",
"en_trf_robertabase_lg",
"en_trf_distilbertbaseuncased_lg",
"en_trf_xlnetbasecased_lg"
],
"example": "This is a sentence.",
"has_examples": true
@ -19,7 +19,7 @@
{
"code": "de",
"name": "German",
"models": ["de_core_news_sm", "de_core_news_md", "de_pytt_bertbasecased_lg"],
"models": ["de_core_news_sm", "de_core_news_md", "de_trf_bertbasecased_lg"],
"example": "Dies ist ein Satz.",
"has_examples": true
},

View File

@ -1675,21 +1675,21 @@
}
},
{
"id": "spacy-pytorch-transformers",
"title": "spacy-pytorch-transformers",
"id": "spacy-transformers",
"title": "spacy-transformers",
"slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2",
"description": "This package provides spaCy model pipelines that wrap [Hugging Face's `pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
"github": "explosion/spacy-pytorch-transformers",
"url": "https://explosion.ai/blog/spacy-pytorch-transformers",
"pip": "spacy-pytorch-transformers",
"description": "This package provides spaCy model pipelines that wrap [Hugging Face's `transformers`](https://github.com/huggingface/transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
"github": "explosion/spacy-transformers",
"url": "https://explosion.ai/blog/spacy-transformers",
"pip": "spacy-transformers",
"category": ["pipeline", "models", "research"],
"code_example": [
"import spacy",
"",
"nlp = spacy.load(\"en_pytt_bertbaseuncased_lg\")",
"nlp = spacy.load(\"en_trf_bertbaseuncased_lg\")",
"doc = nlp(\"Apple shares rose on the news. Apple pie is delicious.\")",
"print(doc[0].similarity(doc[7]))",
"print(doc._.pytt_last_hidden_state.shape)"
"print(doc._.trf_last_hidden_state.shape)"
],
"author": "Explosion",
"author_links": {

View File

@ -23,6 +23,7 @@ const MODEL_META = {
dep: 'Vocabulary, syntax',
ent: 'Named entities',
pytt: 'PyTorch Transformers',
trf: 'Transformers',
vectors: 'Word vectors',
web: 'written text (blogs, news, comments)',
news: 'written text (news, media)',