From a3509f67d48d8ba9f7eb83201ef38de4165cd50f Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 7 Oct 2019 13:17:03 +0200 Subject: [PATCH 01/11] Extend unicode character block for Sinhala (#4378) * Extend unicode character block for Sinhala * Add sentencizer tests for more languages --- spacy/lang/char_classes.py | 2 +- spacy/tests/pipeline/test_sentencizer.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index cb5b50ffc..5ed2a2a8c 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -184,7 +184,7 @@ _russian_lower = r"ёа-я" _russian_upper = r"ЁА-Я" _russian = r"ёа-яЁА-Я" -_sinhala = r"\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6" +_sinhala = r"\u0D80-\u0DFF" _tatar_lower = r"әөүҗңһ" _tatar_upper = r"ӘӨҮҖҢҺ" diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index 1e03dc743..d91fdd198 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +import spacy from spacy.pipeline import Sentencizer from spacy.tokens import Doc @@ -85,3 +86,26 @@ def test_sentencizer_serialize_bytes(en_vocab): bytes_data = sentencizer.to_bytes() new_sentencizer = Sentencizer().from_bytes(bytes_data) assert new_sentencizer.punct_chars == set(punct_chars) + + +@pytest.mark.parametrize( + # fmt: off + "lang,text", + [ + ('bn', 'বাংলা ভাষা (বাঙলা, বাঙ্গলা, তথা বাঙ্গালা নামগুলোতেও পরিচিত) একটি ইন্দো-আর্য ভাষা, যা দক্ষিণ এশিয়ার বাঙালি জাতির প্রধান কথ্য ও লেখ্য ভাষা। মাতৃভাষীর সংখ্যায় বাংলা ইন্দো-ইউরোপীয় ভাষা পরিবারের চতুর্থ ও বিশ্বের ষষ্ঠ বৃহত্তম ভাষা।[৫] মোট ব্যবহারকারীর সংখ্যা অনুসারে বাংলা বিশ্বের সপ্তম বৃহত্তম ভাষা। বাংলা সার্বভৌম ভাষাভিত্তিক জাতিরাষ্ট্র বাংলাদেশের একমাত্র রাষ্ট্রভাষা তথা সরকারি ভাষা[৬] এবং ভারতের পশ্চিমবঙ্গ, ত্রিপুরা, আসামের বরাক উপত্যকার সরকারি ভাষা। বঙ্গোপসাগরে অবস্থিত আন্দামান দ্বীপপুঞ্জের প্রধান কথ্য ভাষা বাংলা। এছাড়া ভারতের ঝাড়খণ্ড, বিহার, মেঘালয়, মিজোরাম, উড়িষ্যা রাজ্যগুলোতে উল্লেখযোগ্য পরিমাণে বাংলাভাষী জনগণ রয়েছে। ভারতে হিন্দির পরেই সর্বাধিক প্রচলিত ভাষা বাংলা।[৭][৮] এছাড়াও মধ্য প্রাচ্য, আমেরিকা ও ইউরোপে উল্লেখযোগ্য পরিমাণে বাংলাভাষী অভিবাসী রয়েছে।[৯] সারা বিশ্বে সব মিলিয়ে ২৬ কোটির অধিক লোক দৈনন্দিন জীবনে বাংলা ব্যবহার করে।[২] বাংলাদেশের জাতীয় সঙ্গীত এবং ভারতের জাতীয় সঙ্গীত ও স্তোত্র বাংলাতে রচিত।'), + ('de', 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache. Ihr Sprachraum umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig. Außerdem ist sie eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z. B. 
in Rumänien und Südafrika, sowie Nationalsprache im afrikanischen Namibia.'), + ('hi', 'हिन्दी विश्व की एक प्रमुख भाषा है एवं भारत की राजभाषा है। केन्द्रीय स्तर पर भारत में दूसरी आधिकारिक भाषा अंग्रेजी है। यह हिंदुस्तानी भाषा की एक मानकीकृत रूप है जिसमें संस्कृत के तत्सम तथा तद्भव शब्दों का प्रयोग अधिक है और अरबी-फ़ारसी शब्द कम हैं। हिंदी संवैधानिक रूप से भारत की राजभाषा और भारत की सबसे अधिक बोली और समझी जाने वाली भाषा है। हालाँकि, हिन्दी भारत की राष्ट्रभाषा नहीं है,[3] क्योंकि भारत के संविधान में कोई भी भाषा को ऐसा दर्जा नहीं दिया गया था।[4][5] चीनी के बाद यह विश्व में सबसे अधिक बोली जाने वाली भाषा भी है। विश्व आर्थिक मंच की गणना के अनुसार यह विश्व की दस शक्तिशाली भाषाओं में से एक है।[6]'), + ('kn', 'ದ್ರಾವಿಡ ಭಾಷೆಗಳಲ್ಲಿ ಪ್ರಾಮುಖ್ಯವುಳ್ಳ ಭಾಷೆಯೂ ಭಾರತದ ಪುರಾತನವಾದ ಭಾಷೆಗಳಲ್ಲಿ ಒಂದೂ ಆಗಿರುವ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಅದರ ವಿವಿಧ ರೂಪಗಳಲ್ಲಿ ಸುಮಾರು ೪೫ ದಶಲಕ್ಷ ಜನರು ಆಡು ನುಡಿಯಾಗಿ ಬಳಸುತ್ತಲಿದ್ದಾರೆ. ಕನ್ನಡ ಕರ್ನಾಟಕ ರಾಜ್ಯದ ಆಡಳಿತ ಭಾಷೆ.[೧೧] ಜಗತ್ತಿನಲ್ಲಿ ಅತ್ಯಂತ ಹೆಚ್ಚು ಮಂದಿ ಮಾತನಾಡುವ ಭಾಷೆಯೆಂಬ ನೆಲೆಯಲ್ಲಿ ಇಪ್ಪತೊಂಬತ್ತನೆಯ ಸ್ಥಾನ ಕನ್ನಡಕ್ಕಿದೆ. ೨೦೧೧ರ ಜನಗಣತಿಯ ಪ್ರಕಾರ ಜಗತ್ತಿನಲ್ಲಿ ೬.೪ ಕೋಟಿ ಜನಗಳು ಕನ್ನಡ ಮಾತನಾಡುತ್ತಾರೆ ಎಂದು ತಿಳಿದುಬಂದಿದೆ. ಇವರಲ್ಲಿ ೫.೫ ಕೋಟಿ ಜನಗಳ ಮಾತೃಭಾಷೆ ಕನ್ನಡವಾಗಿದೆ. ಬ್ರಾಹ್ಮಿ ಲಿಪಿಯಿಂದ ರೂಪುಗೊಂಡ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಉಪಯೋಗಿಸಿ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಬರೆಯಲಾಗುತ್ತದೆ. ಕನ್ನಡ ಬರಹದ ಮಾದರಿಗಳಿಗೆ ಸಾವಿರದ ಐನೂರು ವರುಷಗಳ ಚರಿತ್ರೆಯಿದೆ. ಕ್ರಿ.ಶ. ಆರನೆಯ ಶತಮಾನದ ಪಶ್ಚಿಮ ಗಂಗ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ [೧೨] ಮತ್ತು ಒಂಬತ್ತನೆಯ ಶತಮಾನದ ರಾಷ್ಟ್ರಕೂಟ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ ಹಳಗನ್ನಡ ಸಾಹಿತ್ಯ ಅತ್ಯಂತ ಹೆಚ್ಚಿನ ರಾಜಾಶ್ರಯ ಪಡೆಯಿತು.[೧೩][೧೪] ಅದಲ್ಲದೆ ಸಾವಿರ ವರುಷಗಳ ಸಾಹಿತ್ಯ ಪರಂಪರೆ ಕನ್ನಡಕ್ಕಿದೆ.[೧೫]ವಿನೋಬಾ ಭಾವೆ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಲಿಪಿಗಳ ರಾಣಿಯೆಂದು ಹೊಗಳಿದ್ದಾರೆ.[ಸೂಕ್ತ ಉಲ್ಲೇಖನ ಬೇಕು]'), + ('si', 'ශ්‍රී ලංකාවේ ප්‍රධාන ජාතිය වන සිංහල ජනයාගේ මව් බස සිංහල වෙයි. අද වන විට මිලියන 20 කට අධික සිංහල සහ මිලියන 3කට අධික සිංහල නොවන ජනගහනයක් සිංහල භාෂාව භාවිත කරති. සිංහල‍ ඉන්දු-යුරෝපීය භාෂාවල උප ගණයක් වන ඉන්දු-ආර්ය භාෂා ගණයට අයිති වන අතර මාල දිවයින භාවිත කරන දිවෙහි භාෂාව සිංහලයෙන් පැවත එන්නකි. සිංහල ශ්‍රී ලංකාවේ නිල භාෂාවයි .'), + ('ta', 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும். தமிழ் திராவிட மொழிக் குடும்பத்தின் முதன்மையான மொழிகளில் ஒன்றும் செம்மொழியும் ஆகும். இந்தியா, இலங்கை, மலேசியா, சிங்கப்பூர் ஆகிய நாடுகளில் அதிக அளவிலும், ஐக்கிய அரபு அமீரகம், தென்னாப்பிரிக்கா, மொரிசியசு, பிஜி, ரீயூனியன், டிரினிடாட் போன்ற நாடுகளில் சிறிய அளவிலும் தமிழ் பேசப்படுகிறது. 1997ஆம் ஆண்டுப் புள்ளி விவரப்படி உலகம் முழுவதிலும் 8 கோடி (80 மில்லியன்) மக்களால் பேசப்படும் தமிழ்[13], ஒரு மொழியைத் தாய்மொழியாகக் கொண்டு பேசும் மக்களின் எண்ணிக்கை அடிப்படையில் பதினெட்டாவது இடத்தில் உள்ளது.[14] இணையத்தில் அதிகம் பயன்படுத்தப்படும் இந்திய மொழிகளில் தமிழ் முதன்மையாக உள்ளதாக 2017 ஆவது ஆண்டில் நடைபெற்ற கூகுள் கணக்கெடுப்பில் தெரிய வந்தது.[15]'), + ('te', 'ఆంధ్ర ప్రదేశ్, తెలంగాణ రాష్ట్రాల అధికార భాష తెలుగు. భారత దేశంలో తెలుగు మాతృభాషగా మాట్లాడే 8.7 కోట్ల (2001) జనాభాతో [1] ప్రాంతీయ భాషలలో మొదటి స్థానంలో ఉంది. ప్రపంచంలోని ప్రజలు అత్యధికముగా మాట్లాడే భాషలలో 15 స్థానములోనూ, భారత దేశములో హిందీ, తర్వాత స్థానములోనూ నిలుస్తుంది. పాతవైన ప్రపంచ భాష గణాంకాల (ఎథ్నోలాగ్) ప్రకారం ప్రపంచవ్యాప్తంగా 7.4 కోట్లు మందికి మాతృభాషగా ఉంది.[2] మొదటి భాషగా మాట్లాడతారు. 
అతి ప్రాచీన దేశ భాషలలో సంస్కృతము తమిళముతో బాటు తెలుగు భాషను 2008 అక్టోబరు 31న భారత ప్రభుత్వము గుర్తించింది.'), + ('ur', 'اُردُو لشکری زبان[8] (یا جدید معیاری اردو) برصغیر کی معیاری زبانوں میں سے ایک ہے۔ یہ پاکستان کی قومی اور رابطہ عامہ کی زبان ہے، جبکہ بھارت کی چھے ریاستوں کی دفتری زبان کا درجہ رکھتی ہے۔ آئین ہند کے مطابق اسے 22 دفتری شناخت زبانوں میں شامل کیا جاچکا ہے۔ 2001ء کی مردم شماری کے مطابق اردو کو بطور مادری زبان بھارت میں 5.01% فیصد لوگ بولتے ہیں اور اس لحاظ سے یہ بھارت کی چھٹی بڑی زبان ہے جبکہ پاکستان میں اسے بطور مادری زبان 7.59% فیصد لوگ استعمال کرتے ہیں، یہ پاکستان کی پانچویں بڑی زبان ہے۔ اردو تاریخی طور پر ہندوستان کی مسلم آبادی سے جڑی ہے۔[حوالہ درکار] بعض ذخیرہ الفاظ کے علاوہ یہ زبان معیاری ہندی سے قابل فہم ہے جو اس خطے کی ہندوؤں سے منسوب ہے۔[حوالہ درکار] زبانِ اردو کو پہچان و ترقی اس وقت ملی جب برطانوی دور میں انگریز حکمرانوں نے اسے فارسی کی بجائے انگریزی کے ساتھ شمالی ہندوستان کے علاقوں اور جموں و کشمیر میں اسے سنہ 1846ء اور پنجاب میں سنہ 1849ء میں بطور دفتری زبان نافذ کیا۔ اس کے علاوہ خلیجی، یورپی، ایشیائی اور امریکی علاقوں میں اردو بولنے والوں کی ایک بڑی تعداد آباد ہے جو بنیادی طور پر جنوبی ایشیاء سے کوچ کرنے والے اہلِ اردو ہیں۔ 1999ء کے اعداد وشمار کے مطابق اردو زبان کے مجموعی متکلمین کی تعداد دس کروڑ ساٹھ لاکھ کے لگ بھگ تھی۔ اس لحاظ سے یہ دنیا کی نویں بڑی زبان ہے۔'), + ], + # fmt: on +) +def test_sentencizer_across_scripts(lang, text): + nlp = spacy.blank(lang) + sentencizer = Sentencizer() + nlp.add_pipe(sentencizer) + doc = nlp(text) + assert len(list(doc.sents)) > 1 From d53a8d9313099b0c9724e28ca276603274749313 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 7 Oct 2019 13:38:35 +0200 Subject: [PATCH 02/11] Consider batch_size when sorting similar vectors (#4388) --- spacy/vectors.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 75716617c..881f01052 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -337,7 +337,7 @@ cdef class Vectors: scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:] if sort: - sorted_index = xp.arange(scores.shape[0])[:,None],xp.argsort(scores, axis=1)[:,::-1] + sorted_index = xp.arange(scores.shape[0])[:,None],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1] scores[i:i+batch_size] = scores[sorted_index] best_rows[i:i+batch_size] = best_rows[sorted_index] From 9cd6ca3e4d9074f373c3cac7f96b4bd725bc2730 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 7 Oct 2019 17:22:09 +0200 Subject: [PATCH 03/11] Improve usage of pkg_resources and handling of entry points (#4387) * Only import pkg_resources where it's needed Apparently it's really slow * Use importlib_metadata for entry points * Revert "Only import pkg_resources where it's needed" This reverts commit 5ed8c03afac098e6b676e1da44486b7f7dc30f73. * Revert "Revert "Only import pkg_resources where it's needed"" This reverts commit 8b30b579571d2af259f14d360877b7563065cf6d. * Revert "Use importlib_metadata for entry points" This reverts commit 9f071f5c405427666a8d77098d3444fb410a59f9. * Revert "Revert "Use importlib_metadata for entry points"" This reverts commit 02e12a17ece2c80bf2d878b45997d91efdc9257d. 
* Skip test that weirdly hangs

* Fix hanging test by using global
---
 requirements.txt      |  1 +
 setup.cfg             |  1 +
 spacy/cli/download.py |  3 ++-
 spacy/cli/validate.py |  3 ++-
 spacy/compat.py       |  5 +++++
 spacy/util.py         | 14 ++++++++++----
 6 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 601b73559..0178c41ff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,7 @@ numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 plac<1.0.0,>=0.9.6
 pathlib==1.0.1; python_version < "3.4"
+importlib_metadata>=0.23; python_version < "3.8"
 # Optional dependencies
 jsonschema>=2.6.0,<3.1.0
 # Development dependencies
diff --git a/setup.cfg b/setup.cfg
index bcb85eef3..c626f9566 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -50,6 +50,7 @@ install_requires =
     wasabi>=0.2.0,<1.1.0
     srsly>=0.1.0,<1.1.0
     pathlib==1.0.1; python_version < "3.4"
+    importlib_metadata>=0.23; python_version < "3.8"
 
 [options.extras_require]
 lookups =
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 64ab03a75..c57e2364b 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -6,7 +6,6 @@ import requests
 import os
 import subprocess
 import sys
-import pkg_resources
 from wasabi import Printer
 
 from .link import link
@@ -87,6 +86,8 @@ def download(model, direct=False, *pip_args):
 
 def require_package(name):
     try:
+        import pkg_resources
+
         pkg_resources.working_set.require(name)
         return True
     except:  # noqa: E722
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index f608ccd7f..38f8d2313 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function
 
-import pkg_resources
 from pathlib import Path
 import sys
 import requests
@@ -109,6 +108,8 @@ def get_model_links(compat):
 
 
 def get_model_pkgs(compat, all_models):
+    import pkg_resources
+
     pkgs = {}
     for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
         package = pkg_name.replace("-", "_")
diff --git a/spacy/compat.py b/spacy/compat.py
index 16b400ad7..3a19e9423 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -35,6 +35,11 @@ try:
 except ImportError:
     cupy = None
 
+try:  # Python 3.8
+    import importlib.metadata as importlib_metadata
+except ImportError:
+    import importlib_metadata  # noqa: F401
+
 try:
     from thinc.neural.optimizers import Optimizer  # noqa: F401
 except ImportError:
diff --git a/spacy/util.py b/spacy/util.py
index c7ce38c3f..39cb73c05 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals, print_function
 
 import os
-import pkg_resources
 import importlib
 import re
 from pathlib import Path
@@ -28,7 +27,7 @@ except ImportError:
 
 from .symbols import ORTH
 from .compat import cupy, CudaStream, path2str, basestring_, unicode_
-from .compat import import_file
+from .compat import import_file, importlib_metadata
 from .errors import Errors, Warnings, deprecation_warning
 
 
@@ -37,6 +36,11 @@ _data_path = Path(__file__).parent / "data"
 _PRINT_ENV = False
 
 
+# NB: Only ever call this once! If called more than once within the
+# function, test_issue1506 hangs and it's not 100% clear why.
+AVAILABLE_ENTRY_POINTS = importlib_metadata.entry_points()
+
+
 class ENTRY_POINTS(object):
     """Available entry points to register extensions."""
 
@@ -253,6 +257,8 @@ def is_package(name):
     name (unicode): Name of package.
     RETURNS (bool): True if installed package, False if not.
""" + import pkg_resources + name = name.lower() # compare package name against lowercase name packages = pkg_resources.working_set.by_key.keys() for package in packages: @@ -282,7 +288,7 @@ def get_entry_points(key): RETURNS (dict): Entry points, keyed by name. """ result = {} - for entry_point in pkg_resources.iter_entry_points(key): + for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []): result[entry_point.name] = entry_point.load() return result @@ -296,7 +302,7 @@ def get_entry_point(key, value, default=None): default: Optional default value to return. RETURNS: The loaded entry point or None. """ - for entry_point in pkg_resources.iter_entry_points(key): + for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []): if entry_point.name == value: return entry_point.load() return default From 29f9fec267d70a6059f6dd10b33c2bb762ba0e68 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 7 Oct 2019 23:34:58 +0200 Subject: [PATCH 04/11] Improve spacy pretrain (#4393) * Support bilstm_depth arg in spacy pretrain * Add option to ignore zero vectors in get_cossim_loss * Use cosine loss in Cloze multitask --- spacy/_ml.py | 14 +++++++++++--- spacy/cli/pretrain.py | 8 +++++++- spacy/pipeline/pipes.pyx | 5 ++--- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 6104324ab..86dac6c7a 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -953,16 +953,24 @@ class CharacterEmbed(Model): return output, backprop_character_embed -def get_cossim_loss(yh, y): +def get_cossim_loss(yh, y, ignore_zeros=False): + xp = get_array_module(yh) + # Find the zero vectors + if ignore_zeros: + zero_indices = xp.abs(y).sum(axis=1) == 0 # Add a small constant to avoid 0 vectors yh = yh + 1e-8 y = y + 1e-8 # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity - xp = get_array_module(yh) norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True) norm_y = xp.linalg.norm(y, axis=1, keepdims=True) mul_norms = norm_yh * norm_y cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2)) - loss = xp.abs(cosine - 1).sum() + losses = xp.abs(cosine - 1) + if ignore_zeros: + # If the target was a zero vector, don't count it in the loss. + d_yh[zero_indices] = 0 + losses[zero_indices] = 0 + loss = losses.sum() return loss, -d_yh diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 60f703d2f..891e15fa2 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -35,6 +35,7 @@ from .train import _load_pretrained_tok2vec output_dir=("Directory to write models to on each epoch", "positional", None, str), width=("Width of CNN layers", "option", "cw", int), depth=("Depth of CNN layers", "option", "cd", int), + bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int), embed_rows=("Number of embedding rows", "option", "er", int), loss_func=( "Loss function to use for the objective. Either 'L2' or 'cosine'", @@ -80,6 +81,7 @@ def pretrain( output_dir, width=96, depth=4, + bilstm_depth=2, embed_rows=2000, loss_func="cosine", use_vectors=False, @@ -116,6 +118,10 @@ def pretrain( util.fix_random_seed(seed) has_gpu = prefer_gpu() + if has_gpu: + import torch + + torch.set_default_tensor_type("torch.cuda.FloatTensor") msg.info("Using GPU" if has_gpu else "Not using GPU") output_dir = Path(output_dir) @@ -151,7 +157,7 @@ def pretrain( embed_rows, conv_depth=depth, pretrained_vectors=pretrained_vectors, - bilstm_depth=0, # Requires PyTorch. Experimental. 
+ bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental. cnn_maxout_pieces=3, # You can try setting this higher subword_features=True, # Set to False for Chinese etc ), diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 23509fcae..63ab09e56 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -29,7 +29,7 @@ from .._ml import Tok2Vec, build_tagger_model, cosine, get_cossim_loss from .._ml import build_text_classifier, build_simple_cnn_text_classifier from .._ml import build_bow_text_classifier, build_nel_encoder from .._ml import link_vectors_to_models, zero_init, flatten -from .._ml import masked_language_model, create_default_optimizer +from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss from ..errors import Errors, TempErrors, user_warning, Warnings from .. import util @@ -880,8 +880,7 @@ class ClozeMultitask(Pipe): # and look them up all at once. This prevents data copying. ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs]) target = vectors[ids] - gradient = (prediction - target) / prediction.shape[0] - loss = (gradient**2).sum() + loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True) return float(loss), gradient def update(self, docs, golds, drop=0., sgd=None, losses=None): From fd4a5341b0beffa126f1868eb9031cd581fd9515 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 8 Oct 2019 00:52:45 +0200 Subject: [PATCH 05/11] Fix ner_jsonl2json converter (fix #4389) (#4394) --- spacy/cli/converters/jsonl2json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py index 91dd42982..1c1bc45c7 100644 --- a/spacy/cli/converters/jsonl2json.py +++ b/spacy/cli/converters/jsonl2json.py @@ -7,7 +7,7 @@ from ...gold import docs_to_json from ...util import get_lang_class, minibatch -def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False): +def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_): if lang is None: raise ValueError("No --lang specified, but tokenization required") json_docs = [] From 14841d0aa63ccc48605757e5121cbb4170e7c7b2 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 8 Oct 2019 12:07:02 +0200 Subject: [PATCH 06/11] Fix PhraseMatcher callback and add tests (#4399) * Fix callback lookup in PhraseMatcher (string key rather than hash key) * Add callback tests for Matcher and PhraseMatcher --- spacy/matcher/phrasematcher.pyx | 2 +- spacy/tests/matcher/test_matcher_api.py | 11 +++++++++++ spacy/tests/matcher/test_phrase_matcher.py | 11 +++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index b6c9e01d2..33b24c129 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -225,7 +225,7 @@ cdef class PhraseMatcher: for i in range(c_matches.size()): matches.append((c_matches[i].match_id, c_matches[i].start, c_matches[i].end)) for i, (ent_id, start, end) in enumerate(matches): - on_match = self._callbacks.get(ent_id) + on_match = self._callbacks.get(self.vocab.strings[ent_id]) if on_match is not None: on_match(self, doc, i, matches) return matches diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 0d640e1a2..730756524 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import 
pytest import re +from mock import Mock from spacy.matcher import Matcher, DependencyMatcher from spacy.tokens import Doc, Token @@ -418,3 +419,13 @@ def test_matcher_valid_callback(en_vocab): with pytest.raises(ValueError): matcher.add("TEST", [], [{"TEXT": "test"}]) matcher(Doc(en_vocab, words=["test"])) + + +def test_matcher_callback(en_vocab): + mock = Mock() + matcher = Matcher(en_vocab) + pattern = [{"ORTH": "test"}] + matcher.add("Rule", mock, pattern) + doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) + matches = matcher(doc) + mock.assert_called_once_with(matcher, doc, 0, matches) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 486cbb984..ad00e2323 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +from mock import Mock from spacy.matcher import PhraseMatcher from spacy.tokens import Doc from ..util import get_doc @@ -215,3 +216,13 @@ def test_attr_pipeline_checks(en_vocab): matcher.add("TEST3", None, doc3) matcher = PhraseMatcher(en_vocab, attr="TEXT") matcher.add("TEST3", None, doc3) + + +def test_phrase_matcher_callback(en_vocab): + mock = Mock() + doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) + pattern = Doc(en_vocab, words=["Google", "Now"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("COMPANY", mock, pattern) + matches = matcher(doc) + mock.assert_called_once_with(matcher, doc, 0, matches) From 650cbfe82dac25136ba593006961fddb09773803 Mon Sep 17 00:00:00 2001 From: tamuhey Date: Tue, 8 Oct 2019 19:20:55 +0900 Subject: [PATCH 07/11] multiprocessing pipe (#1303) (#4371) * refactor: separate formatting docs and golds in Language.update * fix return typo * add pipe test * unpickleable object cannot be assigned to p.map * passed test pipe * passed test! * pipe terminate * try pipe * passed test * fix ch * add comments * fix len(texts) * add comment * add comment * fix: multiprocessing of pipe is not supported in 2 * test: use assert_docs_equal * fix: is_python3 -> is_python2 * fix: change _pipe arg to use functools.partial * test: add vector modification test * test: add sample ner_pipe and user_data pipe * add warnings test * test: fix user warnings * test: fix warnings capture * fix: remove islice import * test: remove warnings test * test: add stream test * test: rename * fix: multiproc stream * fix: stream pipe * add comment * mp.Pipe seems to be able to use with relative small data * test: skip stream test in python2 * sort imports * test: add reason to skiptest * fix: use pipe for docs communucation * add comments * add comment --- spacy/errors.py | 1 + spacy/language.py | 125 +++++++++++++++++++++++++++++++++-- spacy/tests/test_language.py | 82 ++++++++++++++++++++++- 3 files changed, 198 insertions(+), 10 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index ecebc8345..a4b16f6fa 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -95,6 +95,7 @@ class Warnings(object): "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. " "If this is surprising, make sure you have the spacy-lookups-data " "package installed.") + W023 = ("Multiprocessing of Language.pipe is not supported in Python2. 
'n_process' will be set to 1.")
 
 
 @add_codes
diff --git a/spacy/language.py b/spacy/language.py
index b2a81fc60..dc229c2ba 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,8 +1,11 @@
 # coding: utf8
 from __future__ import absolute_import, unicode_literals
 
+import atexit
 import random
 import itertools
+from warnings import warn
+from spacy.util import minibatch
 import weakref
 import functools
 from collections import OrderedDict
@@ -10,6 +13,8 @@ from contextlib import contextmanager
 from copy import copy, deepcopy
 from thinc.neural import Model
 import srsly
+import multiprocessing as mp
+from itertools import chain, cycle
 
 from .tokenizer import Tokenizer
 from .vocab import Vocab
@@ -21,7 +26,7 @@ from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
 from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .pipeline import EntityRuler
 from .pipeline import Morphologizer
-from .compat import izip, basestring_
+from .compat import izip, basestring_, is_python2
 from .gold import GoldParse
 from .scorer import Scorer
 from ._ml import link_vectors_to_models, create_default_optimizer
@@ -30,8 +35,9 @@ from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
 from .lang.tokenizer_exceptions import TOKEN_MATCH
 from .lang.tag_map import TAG_MAP
+from .tokens import Doc
 from .lang.lex_attrs import LEX_ATTRS, is_stop
-from .errors import Errors, Warnings, deprecation_warning
+from .errors import Errors, Warnings, deprecation_warning, user_warning
 from . import util
 from . import about
 
@@ -733,6 +739,7 @@ class Language(object):
         disable=[],
         cleanup=False,
         component_cfg=None,
+        n_process=1,
     ):
         """Process texts as a stream, and yield `Doc` objects in order.
@@ -746,12 +753,20 @@
             use. Experimental.
         component_cfg (dict): An optional dictionary with extra keyword
             arguments for specific components.
+        n_process (int): Number of processes to use (only supported in Python 3). If -1, use `multiprocessing.cpu_count()`.
         YIELDS (Doc): Documents in the order of the original text.
 
         DOCS: https://spacy.io/api/language#pipe
         """
+        # raw_texts will be used later to stop the iterator.
+        texts, raw_texts = itertools.tee(texts)
+        if is_python2 and n_process != 1:
+            user_warning(Warnings.W023)
+            n_process = 1
         if n_threads != -1:
             deprecation_warning(Warnings.W016)
+        if n_process == -1:
+            n_process = mp.cpu_count()
         if as_tuples:
             text_context1, text_context2 = itertools.tee(texts)
             texts = (tc[0] for tc in text_context1)
@@ -765,9 +780,12 @@
             for doc, context in izip(docs, contexts):
                 yield (doc, context)
             return
-        docs = (self.make_doc(text) for text in texts)
         if component_cfg is None:
             component_cfg = {}
+
+        pipes = (
+            []
+        )  # contains functools.partial objects so that multiprocessing workers can be created easily
         for name, proc in self.pipeline:
             if name in disable:
                 continue
@@ -775,10 +793,20 @@
             # Allow component_cfg to overwrite the top-level kwargs.
             kwargs.setdefault("batch_size", batch_size)
             if hasattr(proc, "pipe"):
-                docs = proc.pipe(docs, **kwargs)
+                f = functools.partial(proc.pipe, **kwargs)
             else:
                 # Apply the function, but yield the doc
-                docs = _pipe(proc, docs, kwargs)
+                f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
+            pipes.append(f)
+
+        if n_process != 1:
+            docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
+        else:
+            # if n_process == 1, no processes are forked.
+            docs = (self.make_doc(text) for text in texts)
+            for pipe in pipes:
+                docs = pipe(docs)
+
         # Track weakrefs of "recent" documents, so that we can see when they
         # expire from memory. When they do, we know we don't need old strings.
         # This way, we avoid maintaining an unbounded growth in string entries
@@ -809,6 +837,46 @@ class Language(object):
                 self.tokenizer._reset_cache(keys)
                 nr_seen = 0
 
+    def _multiprocessing_pipe(self, texts, pipes, n_process, batch_size):
+        # raw_texts is used later to stop iteration.
+        texts, raw_texts = itertools.tee(texts)
+        # Queues for sending texts to the workers
+        texts_q = [mp.Queue() for _ in range(n_process)]
+        # Channels for receiving byte-encoded docs from the workers
+        bytedocs_recv_ch, bytedocs_send_ch = zip(
+            *[mp.Pipe(False) for _ in range(n_process)]
+        )
+
+        batch_texts = minibatch(texts, batch_size)
+        # Sender sends texts to the workers.
+        # This is necessary to properly handle texts of infinite length.
+        # (In that case, not all data can be sent to the workers at once.)
+        sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
+        # Send twice so that the worker processes start off busy.
+        sender.send()
+        sender.send()
+
+        procs = [
+            mp.Process(target=_apply_pipes, args=(self.make_doc, pipes, rch, sch))
+            for rch, sch in zip(texts_q, bytedocs_send_ch)
+        ]
+        for proc in procs:
+            proc.start()
+
+        # Cycle through the channels so the original order of the docs is preserved.
+        # Each received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
+        byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
+        docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
+        try:
+            for i, (_, doc) in enumerate(zip(raw_texts, docs), 1):
+                yield doc
+                if i % batch_size == 0:
+                    # Tell `sender` that one batch was consumed.
+                    sender.step()
+        finally:
+            for proc in procs:
+                proc.terminate()
+
     def to_disk(self, path, exclude=tuple(), disable=None):
         """Save the current state to a directory. If a model is loaded, this
         will include the model.
@@ -987,12 +1055,55 @@ class DisabledPipes(list):
         self[:] = []
 
 
-def _pipe(func, docs, kwargs):
+def _pipe(docs, proc, kwargs):
     # We added some args for pipe that __call__ doesn't expect.
     kwargs = dict(kwargs)
     for arg in ["n_threads", "batch_size"]:
         if arg in kwargs:
             kwargs.pop(arg)
     for doc in docs:
-        doc = func(doc, **kwargs)
+        doc = proc(doc, **kwargs)
         yield doc
+
+
+def _apply_pipes(make_doc, pipes, receiver, sender):
+    """Worker for Language.pipe
+
+    Args:
+        receiver (multiprocessing.Queue): Queue to receive batches of texts. Usually created by `multiprocessing.Queue()`
+        sender (multiprocessing.Connection): Pipe to send doc. Usually created by `multiprocessing.Pipe()`
+    """
+    while True:
+        texts = receiver.get()
+        docs = (make_doc(text) for text in texts)
+        for pipe in pipes:
+            docs = pipe(docs)
+        # Connection does not accept unpicklable objects, so send a list.
+        sender.send([doc.to_bytes() for doc in docs])
+
+
+class _Sender:
+    """Util for sending data to multiprocessing workers in Language.pipe"""
+
+    def __init__(self, data, queues, chunk_size):
+        self.data = iter(data)
+        self.queues = iter(cycle(queues))
+        self.chunk_size = chunk_size
+        self.count = 0
+
+    def send(self):
+        """Send chunk_size items from self.data to channels."""
+        for item, q in itertools.islice(
+            zip(self.data, cycle(self.queues)), self.chunk_size
+        ):
+            # Cycle through the channels so the texts are distributed evenly.
+            q.put(item)
+
+    def step(self):
+        """Tell sender that one item was consumed.
+ + Data is sent to the workers after every chunk_size calls.""" + self.count += 1 + if self.count >= self.chunk_size: + self.count = 0 + self.send() diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 94c37d4ab..d5398c145 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -1,11 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools + import pytest -from spacy.vocab import Vocab -from spacy.language import Language -from spacy.tokens import Doc +from spacy.compat import is_python2 from spacy.gold import GoldParse +from spacy.language import Language +from spacy.tokens import Doc, Span +from spacy.vocab import Vocab + +from .util import add_vecs_to_vocab, assert_docs_equal @pytest.fixture @@ -58,3 +63,74 @@ def test_language_evaluate(nlp): # Evaluate badly with pytest.raises(Exception): nlp.evaluate([text, gold]) + + +def vector_modification_pipe(doc): + doc.vector += 1 + return doc + + +def userdata_pipe(doc): + doc.user_data["foo"] = "bar" + return doc + + +def ner_pipe(doc): + span = Span(doc, 0, 1, label="FIRST") + doc.ents += (span,) + return doc + + +@pytest.fixture +def sample_vectors(): + return [ + ("spacy", [-0.1, -0.2, -0.3]), + ("world", [-0.2, -0.3, -0.4]), + ("pipe", [0.7, 0.8, 0.9]), + ] + + +@pytest.fixture +def nlp2(nlp, sample_vectors): + add_vecs_to_vocab(nlp.vocab, sample_vectors) + nlp.add_pipe(vector_modification_pipe) + nlp.add_pipe(ner_pipe) + nlp.add_pipe(userdata_pipe) + return nlp + + +@pytest.fixture +def texts(): + data = [ + "Hello world.", + "This is spacy.", + "You can use multiprocessing with pipe method.", + "Please try!", + ] + return data + + +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe(nlp2, n_process, texts): + texts = texts * 10 + expecteds = [nlp2(text) for text in texts] + docs = nlp2.pipe(texts, n_process=n_process, batch_size=2) + + for doc, expected_doc in zip(docs, expecteds): + assert_docs_equal(doc, expected_doc) + + +@pytest.mark.skipif( + is_python2, reason="python2 seems to be unable to handle iterator properly" +) +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_stream(nlp2, n_process, texts): + # check if nlp.pipe can handle infinite length iterator properly. + stream_texts = itertools.cycle(texts) + texts0, texts1 = itertools.tee(stream_texts) + expecteds = (nlp2(text) for text in texts0) + docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2) + + n_fetch = 20 + for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch): + assert_docs_equal(doc, expected_doc) From ddd6fda59cb5499729c936400998e0137c995bf1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 8 Oct 2019 12:21:03 +0200 Subject: [PATCH 08/11] Add registry for model creation functions ('architectures') (#4395) * Add architecture registry * Add test for arch registry * Add error for model architectures --- spacy/__init__.py | 1 + spacy/errors.py | 1 + spacy/tests/test_register_architecture.py | 19 ++++++++++++++ spacy/util.py | 30 +++++++++++++++++++++++ 4 files changed, 51 insertions(+) create mode 100644 spacy/tests/test_register_architecture.py diff --git a/spacy/__init__.py b/spacy/__init__.py index 9edbab198..8930b1d4e 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -14,6 +14,7 @@ from .glossary import explain from .about import __version__ from .errors import Errors, Warnings, deprecation_warning from . 
import util +from .util import register_architecture, get_architecture if sys.maxunicode == 65535: diff --git a/spacy/errors.py b/spacy/errors.py index a4b16f6fa..de93eaf2e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -496,6 +496,7 @@ class Errors(object): E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of " "Lookups containing the lemmatization tables. See the docs for " "details: https://spacy.io/api/lemmatizer#init") + E174 = ("Architecture {name} not found in registry. Available names: {names}") @add_codes diff --git a/spacy/tests/test_register_architecture.py b/spacy/tests/test_register_architecture.py new file mode 100644 index 000000000..0c1b5b16f --- /dev/null +++ b/spacy/tests/test_register_architecture.py @@ -0,0 +1,19 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy import register_architecture +from spacy import get_architecture +from thinc.v2v import Affine + + +@register_architecture("my_test_function") +def create_model(nr_in, nr_out): + return Affine(nr_in, nr_out) + + +def test_get_architecture(): + arch = get_architecture("my_test_function") + assert arch is create_model + with pytest.raises(KeyError): + get_architecture("not_an_existing_key") diff --git a/spacy/util.py b/spacy/util.py index 39cb73c05..d56f39a78 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -32,6 +32,7 @@ from .errors import Errors, Warnings, deprecation_warning LANGUAGES = {} +ARCHITECTURES = {} _data_path = Path(__file__).parent / "data" _PRINT_ENV = False @@ -48,6 +49,7 @@ class ENTRY_POINTS(object): languages = "spacy_languages" displacy_colors = "spacy_displacy_colors" lookups = "spacy_lookups" + architectures = "spacy_architectures" def set_env_log(value): @@ -119,6 +121,34 @@ def set_data_path(path): _data_path = ensure_path(path) +def register_architecture(name, arch=None): + """Decorator to register an architecture. An architecture is a function + that returns a Thinc Model object. + """ + global ARCHITECTURES + if arch is not None: + ARCHITECTURES[name] = arch + return arch + + def do_registration(arch): + ARCHITECTURES[name] = arch + return arch + + return do_registration + + +def get_architecture(name): + """Get a model architecture function by name.""" + # Check if an entry point is exposed for the architecture code + entry_point = get_entry_point(ENTRY_POINTS.architectures, name) + if entry_point is not None: + ARCHITECTURES[name] = entry_point + if name not in ARCHITECTURES: + names = ", ".join(sorted(ARCHITECTURES.keys())) + raise KeyError(Errors.E174.format(name=name, names=names)) + return ARCHITECTURES[name] + + def ensure_path(path): """Ensure string is converted to a Path. From c4f95c156994175728402e4cdb2752d975a0e42d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 8 Oct 2019 12:25:23 +0200 Subject: [PATCH 09/11] Update formatting and docstrings [ci skip] --- spacy/errors.py | 3 ++- spacy/util.py | 12 +++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index de93eaf2e..5d4d4298e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -496,7 +496,8 @@ class Errors(object): E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of " "Lookups containing the lemmatization tables. See the docs for " "details: https://spacy.io/api/lemmatizer#init") - E174 = ("Architecture {name} not found in registry. Available names: {names}") + E174 = ("Architecture '{name}' not found in registry. 
Available " "names: {names}")
 
 
 @add_codes
diff --git a/spacy/util.py b/spacy/util.py
index d56f39a78..9798ff11b 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -124,6 +124,11 @@ def set_data_path(path):
 def register_architecture(name, arch=None):
     """Decorator to register an architecture. An architecture is a function
     that returns a Thinc Model object.
+
+    name (unicode): The name of the architecture to register.
+    arch (Model): Optional architecture if function is called directly and
+        not used as a decorator.
+    RETURNS (callable): Function to register architecture.
     """
     global ARCHITECTURES
     if arch is not None:
@@ -138,7 +143,12 @@ def register_architecture(name, arch=None):
 
 
 def get_architecture(name):
-    """Get a model architecture function by name."""
+    """Get a model architecture function by name. Raises a KeyError if the
+    architecture is not found.
+
+    name (unicode): The name of the architecture.
+    RETURNS (Model): The architecture.
+    """
     # Check if an entry point is exposed for the architecture code
     entry_point = get_entry_point(ENTRY_POINTS.architectures, name)
     if entry_point is not None:
From dd30d3ec998bfc70084b6dc096972343e3ea38fd Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 8 Oct 2019 12:46:59 +0200
Subject: [PATCH 10/11] Add setuptools as runtime dependency

---
 setup.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.cfg b/setup.cfg
index c626f9566..f3737b48e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -39,6 +39,7 @@ setup_requires =
     murmurhash>=0.28.0,<1.1.0
     thinc>=7.1.1,<7.2.0
 install_requires =
+    setuptools
     numpy>=1.15.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
From 8f76d6c9ef53f17e496cdbae2a1258647178f8b1 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 8 Oct 2019 15:39:38 +0200
Subject: [PATCH 11/11] Update transformer model details [ci skip]

---
 netlify.toml                    |  2 ++
 website/meta/languages.json     | 10 +++++-----
 website/meta/universe.json      | 16 ++++++++--------
 website/src/templates/models.js |  1 +
 4 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/netlify.toml b/netlify.toml
index c116eb49b..45bd2c3b6 100644
--- a/netlify.toml
+++ b/netlify.toml
@@ -48,4 +48,6 @@ redirects = [
     {from = "/api/sentencesegmenter", to="/api/sentencizer"},
     {from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true},
     {from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true},
+    # Renamed universe projects
+    {from = "/universe/project/spacy-pytorch-transformers", to = "/universe/project/spacy-transformers", force = true}
 ]
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 09a17b568..364b2ef6a 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -8,10 +8,10 @@
                 "en_core_web_md",
                 "en_core_web_lg",
                 "en_vectors_web_lg",
-                "en_pytt_bertbaseuncased_lg",
-                "en_pytt_robertabase_lg",
-                "en_pytt_distilbertbaseuncased_lg",
-                "en_pytt_xlnetbasecased_lg"
+                "en_trf_bertbaseuncased_lg",
+                "en_trf_robertabase_lg",
+                "en_trf_distilbertbaseuncased_lg",
+                "en_trf_xlnetbasecased_lg"
             ],
             "example": "This is a sentence.",
             "has_examples": true
@@ -19,7 +19,7 @@
         {
             "code": "de",
             "name": "German",
-            "models": ["de_core_news_sm", "de_core_news_md", "de_pytt_bertbasecased_lg"],
+            "models": ["de_core_news_sm", "de_core_news_md", "de_trf_bertbasecased_lg"],
             "example": "Dies ist ein Satz.",
             "has_examples": true
         },
diff --git a/website/meta/universe.json b/website/meta/universe.json
index d30b77ca4..47fe27cc4 100644
--- a/website/meta/universe.json
+++ 
b/website/meta/universe.json @@ -1675,21 +1675,21 @@ } }, { - "id": "spacy-pytorch-transformers", - "title": "spacy-pytorch-transformers", + "id": "spacy-transformers", + "title": "spacy-transformers", "slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2", - "description": "This package provides spaCy model pipelines that wrap [Hugging Face's `pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.", - "github": "explosion/spacy-pytorch-transformers", - "url": "https://explosion.ai/blog/spacy-pytorch-transformers", - "pip": "spacy-pytorch-transformers", + "description": "This package provides spaCy model pipelines that wrap [Hugging Face's `transformers`](https://github.com/huggingface/transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.", + "github": "explosion/spacy-transformers", + "url": "https://explosion.ai/blog/spacy-transformers", + "pip": "spacy-transformers", "category": ["pipeline", "models", "research"], "code_example": [ "import spacy", "", - "nlp = spacy.load(\"en_pytt_bertbaseuncased_lg\")", + "nlp = spacy.load(\"en_trf_bertbaseuncased_lg\")", "doc = nlp(\"Apple shares rose on the news. Apple pie is delicious.\")", "print(doc[0].similarity(doc[7]))", - "print(doc._.pytt_last_hidden_state.shape)" + "print(doc._.trf_last_hidden_state.shape)" ], "author": "Explosion", "author_links": { diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 3ab701727..82c7e4219 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -23,6 +23,7 @@ const MODEL_META = { dep: 'Vocabulary, syntax', ent: 'Named entities', pytt: 'PyTorch Transformers', + trf: 'Transformers', vectors: 'Word vectors', web: 'written text (blogs, news, comments)', news: 'written text (news, media)',
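
To see what the `ignore_zeros` option added to `get_cossim_loss` in [PATCH 04/11] does, the same computation can be reproduced with plain NumPy. The following is a standalone sketch that mirrors the patched function rather than calling the spaCy implementation itself; the toy `prediction` and `target` arrays are invented for illustration.

import numpy as np


def cossim_loss(yh, y, ignore_zeros=False):
    # Mirrors the patched get_cossim_loss: cosine loss plus gradient,
    # optionally masking rows whose target vector is all zeros.
    if ignore_zeros:
        zero_indices = np.abs(y).sum(axis=1) == 0
    yh = yh + 1e-8
    y = y + 1e-8
    norm_yh = np.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = np.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
    losses = np.abs(cosine - 1)
    if ignore_zeros:
        # Zero target vectors contribute neither loss nor gradient.
        d_yh[zero_indices] = 0
        losses[zero_indices] = 0
    return losses.sum(), -d_yh


prediction = np.array([[1.0, 0.0], [0.0, 1.0]])
target = np.array([[1.0, 0.0], [0.0, 0.0]])  # second row is a zero vector
loss, gradient = cossim_loss(prediction, target, ignore_zeros=True)
assert loss < 1e-6  # only the non-zero row counts, and it matches its target
assert np.all(gradient[1] == 0)  # no gradient flows to the masked row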
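
The PhraseMatcher fix in [PATCH 06/11] makes `on_match` callbacks fire again, because they are now looked up under the string key they were registered with. A minimal sketch of the callback API, reusing the "Google Now" example from the new test; the callback body is made up for illustration.

from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

nlp = English()


def on_match(matcher, doc, i, matches):
    # Called once per match; with the fix, the callback registered under the
    # string key "COMPANY" is found and invoked.
    match_id, start, end = matches[i]
    print("Matched:", doc[start:end].text)


matcher = PhraseMatcher(nlp.vocab)
matcher.add("COMPANY", on_match, nlp("Google Now"))
doc = nlp("I like Google Now best")
matcher(doc)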
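
The `n_process` argument added to `Language.pipe` in [PATCH 07/11] can be exercised with any pipeline. Below is an illustrative sketch, assuming a blank English pipeline with only a sentencizer is enough for the example; the texts are invented. On Python 2, `n_process` falls back to 1 with warning W023.

import spacy


def main():
    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    # Invented example texts; any iterable of strings works, including an
    # infinite generator.
    texts = ["This is a sentence. This is another one.", "Short text."] * 50
    # n_process=2 forks two worker processes (Python 3 only); n_process=-1
    # uses multiprocessing.cpu_count(). Docs come back in the original order.
    for doc in nlp.pipe(texts, n_process=2, batch_size=10):
        print(len(list(doc.sents)))


if __name__ == "__main__":
    # The __main__ guard matters because worker processes are forked/spawned.
    main()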
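
The architecture registry from [PATCH 08/11] can be used either as a decorator or through the `spacy_architectures` entry point. The sketch below mirrors the new test; "my_affine" and the layer sizes are hypothetical names chosen for the example.

from thinc.v2v import Affine

from spacy import register_architecture, get_architecture


@register_architecture("my_affine")
def create_my_affine(nr_out, nr_in):
    # Any function that returns a Thinc Model can be registered under a name.
    return Affine(nr_out, nr_in)


# Look the architecture up by name and build a model from it.
model = get_architecture("my_affine")(300, 128)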