From 66a4834e56389a915cf70d5089f0c2b756467b6a Mon Sep 17 00:00:00 2001 From: Karen Hambardzumyan Date: Mon, 22 Jun 2020 10:50:34 +0400 Subject: [PATCH 01/43] Some changes for Armenian (#5616) * Fixing numericals * We need a Armenian question sign to make the sentence a question --- spacy/lang/hy/examples.py | 2 +- spacy/lang/hy/lex_attrs.py | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py index 323f77b1c..8a00fd243 100644 --- a/spacy/lang/hy/examples.py +++ b/spacy/lang/hy/examples.py @@ -11,6 +11,6 @@ Example sentences to test spaCy and its language models. sentences = [ "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։", "Ո՞վ է Ֆրանսիայի նախագահը։", - "Որն է Միացյալ Նահանգների մայրաքաղաքը։", + "Ո՞րն է Միացյալ Նահանգների մայրաքաղաքը։", "Ե՞րբ է ծնվել Բարաք Օբաման։", ] diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py index b556d679c..dea3c0e97 100644 --- a/spacy/lang/hy/lex_attrs.py +++ b/spacy/lang/hy/lex_attrs.py @@ -18,14 +18,15 @@ _num_words = [ "տասը", "տասնմեկ", "տասներկու", - "տասն­երեք", - "տասն­չորս", - "տասն­հինգ", - "տասն­վեց", - "տասն­յոթ", - "տասն­ութ", - "տասն­ինը", - "քսան" "երեսուն", + "տասներեք", + "տասնչորս", + "տասնհինգ", + "տասնվեց", + "տասնյոթ", + "տասնութ", + "տասնինը", + "քսան", + "երեսուն", "քառասուն", "հիսուն", "վաթսուն", From c34420794acd4e3b656332c430a41252d33a9722 Mon Sep 17 00:00:00 2001 From: Rameshh <30867740+rameshhpathak@users.noreply.github.com> Date: Mon, 22 Jun 2020 14:10:46 +0545 Subject: [PATCH 02/43] Add Nepali Language (#5622) * added support for nepali lang * added examples and test files * added spacy contributor agreement --- .github/contributors/rameshhpathak.md | 106 ++++++ spacy/lang/ne/__init__.py | 23 ++ spacy/lang/ne/examples.py | 22 ++ spacy/lang/ne/lex_attrs.py | 98 +++++ spacy/lang/ne/stop_words.py | 498 ++++++++++++++++++++++++++ spacy/tests/conftest.py | 5 + spacy/tests/lang/ne/__init__.py | 0 spacy/tests/lang/ne/test_text.py | 19 + 8 files changed, 771 insertions(+) create mode 100644 .github/contributors/rameshhpathak.md create mode 100644 spacy/lang/ne/__init__.py create mode 100644 spacy/lang/ne/examples.py create mode 100644 spacy/lang/ne/lex_attrs.py create mode 100644 spacy/lang/ne/stop_words.py create mode 100644 spacy/tests/lang/ne/__init__.py create mode 100644 spacy/tests/lang/ne/test_text.py diff --git a/.github/contributors/rameshhpathak.md b/.github/contributors/rameshhpathak.md new file mode 100644 index 000000000..30a543307 --- /dev/null +++ b/.github/contributors/rameshhpathak.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. 
+ +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. 
Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ramesh Pathak | +| Company name (if applicable) | Diyo AI | +| Title or role (if applicable) | AI Engineer | +| Date | June 21, 2020 | +| GitHub username | rameshhpathak | +| Website (optional) |rameshhpathak.github.io| | diff --git a/spacy/lang/ne/__init__.py b/spacy/lang/ne/__init__.py new file mode 100644 index 000000000..21556277d --- /dev/null +++ b/spacy/lang/ne/__init__.py @@ -0,0 +1,23 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS + +from ...language import Language +from ...attrs import LANG + + +class NepaliDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) + lex_attr_getters[LANG] = lambda text: "ne" # Nepali language ISO code + stop_words = STOP_WORDS + + +class Nepali(Language): + lang = "ne" + Defaults = NepaliDefaults + + +__all__ = ["Nepali"] diff --git a/spacy/lang/ne/examples.py b/spacy/lang/ne/examples.py new file mode 100644 index 000000000..b3c4f9e73 --- /dev/null +++ b/spacy/lang/ne/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.ne.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "एप्पलले अमेरिकी स्टार्टअप १ अर्ब डलरमा किन्ने सोच्दै छ", + "स्वायत्त कारहरूले बीमा दायित्व निर्माताहरु तिर बदल्छन्", + "स्यान फ्रांसिस्कोले फुटपाथ वितरण रोबोटहरु प्रतिबंध गर्ने विचार गर्दै छ", + "लन्डन यूनाइटेड किंगडमको एक ठूलो शहर हो।", + "तिमी कहाँ छौ?", + "फ्रान्स को राष्ट्रपति को हो?", + "संयुक्त राज्यको राजधानी के हो?", + "बराक ओबामा कहिले कहिले जन्मेका हुन्?", +] diff --git a/spacy/lang/ne/lex_attrs.py b/spacy/lang/ne/lex_attrs.py new file mode 100644 index 000000000..652307577 --- /dev/null +++ b/spacy/lang/ne/lex_attrs.py @@ -0,0 +1,98 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..norm_exceptions import BASE_NORMS +from ...attrs import NORM, LIKE_NUM + + +# fmt: off +_stem_suffixes = [ + ["ा", "ि", "ी", "ु", "ू", "ृ", "े", "ै", "ो", "ौ"], + ["ँ", "ं", "्", "ः"], + ["लाई", "ले", "बाट", "को", "मा", "हरू"], + ["हरूलाई", "हरूले", "हरूबाट", "हरूको", "हरूमा"], + ["इलो", "िलो", "नु", "ाउनु", "ई", "इन", "इन्", "इनन्"], + ["एँ", "इँन्", "इस्", "इनस्", "यो", "एन", "यौं", "एनौं", "ए", "एनन्"], + ["छु", "छौँ", "छस्", "छौ", "छ", "छन्", "छेस्", "छे", "छ्यौ", "छिन्", "हुन्छ"], + ["दै", "दिन", "दिँन", "दैनस्", "दैन", "दैनौँ", "दैनौं", "दैनन्"], + ["हुन्न", "न्न", "न्न्स्", "न्नौं", "न्नौ", "न्न्न्", "िई"], + ["अ", "ओ", "ऊ", "अरी", "साथ", "वित्तिकै", "पूर्वक"], + ["याइ", "ाइ", "बार", "वार", "चाँहि"], + ["ने", "ेको", "ेकी", "ेका", "ेर", "दै", "तै", "िकन", "उ", "न", "नन्"] +] +# fmt: on + +# reference 1: https://en.wikipedia.org/wiki/Numbers_in_Nepali_language +# reference 2: https://www.imnepal.com/nepali-numbers/ +_num_words = [ + "शुन्य", + "एक", + "दुई", + "तीन", + "चार", + "पाँच", + "छ", + "सात", + "आठ", + "नौ", + "दश", + "एघार", + 
"बाह्र", + "तेह्र", + "चौध", + "पन्ध्र", + "सोह्र", + "सोह्र", + "सत्र", + "अठार", + "उन्नाइस", + "बीस", + "तीस", + "चालीस", + "पचास", + "साठी", + "सत्तरी", + "असी", + "नब्बे", + "सय", + "हजार", + "लाख", + "करोड", + "अर्ब", + "खर्ब", +] + + +def norm(string): + # normalise base exceptions, e.g. punctuation or currency symbols + if string in BASE_NORMS: + return BASE_NORMS[string] + # set stem word as norm, if available, adapted from: + # https://github.com/explosion/spaCy/blob/master/spacy/lang/hi/lex_attrs.py + # https://www.researchgate.net/publication/237261579_Structure_of_Nepali_Grammar + for suffix_group in reversed(_stem_suffixes): + length = len(suffix_group[0]) + if len(string) <= length: + break + for suffix in suffix_group: + if string.endswith(suffix): + return string[:-length] + return string + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(", ", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text.lower() in _num_words: + return True + return False + + +LEX_ATTRS = {NORM: norm, LIKE_NUM: like_num} diff --git a/spacy/lang/ne/stop_words.py b/spacy/lang/ne/stop_words.py new file mode 100644 index 000000000..f008697d0 --- /dev/null +++ b/spacy/lang/ne/stop_words.py @@ -0,0 +1,498 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt + +STOP_WORDS = set( + """ +अक्सर +अगाडि +अगाडी +अघि +अझै +अठार +अथवा +अनि +अनुसार +अन्तर्गत +अन्य +अन्यत्र +अन्यथा +अब +अरु +अरुलाई +अरू +अर्को +अर्थात +अर्थात् +अलग +अलि +अवस्था +अहिले +आए +आएका +आएको +आज +आजको +आठ +आत्म +आदि +आदिलाई +आफनो +आफू +आफूलाई +आफै +आफैँ +आफ्नै +आफ्नो +आयो +उ +उक्त +उदाहरण +उनको +उनलाई +उनले +उनि +उनी +उनीहरुको +उन्नाइस +उप +उसको +उसलाई +उसले +उहालाई +ऊ +एउटा +एउटै +एक +एकदम +एघार +ओठ +औ +औं +कता +कति +कतै +कम +कमसेकम +कसरि +कसरी +कसै +कसैको +कसैलाई +कसैले +कसैसँग +कस्तो +कहाँबाट +कहिलेकाहीं +का +काम +कारण +कि +किन +किनभने +कुन +कुनै +कुन्नी +कुरा +कृपया +के +केहि +केही +को +कोहि +कोहिपनि +कोही +कोहीपनि +क्रमशः +गए +गएको +गएर +गयौ +गरि +गरी +गरे +गरेका +गरेको +गरेर +गरौं +गर्छ +गर्छन् +गर्छु +गर्दा +गर्दै +गर्न +गर्नु +गर्नुपर्छ +गर्ने +गैर +घर +चार +चाले +चाहनुहुन्छ +चाहन्छु +चाहिं +चाहिए +चाहिंले +चाहीं +चाहेको +चाहेर +चोटी +चौथो +चौध +छ +छन +छन् +छु +छू +छैन +छैनन् +छौ +छौं +जता +जताततै +जना +जनाको +जनालाई +जनाले +जब +जबकि +जबकी +जसको +जसबाट +जसमा +जसरी +जसलाई +जसले +जस्ता +जस्तै +जस्तो +जस्तोसुकै +जहाँ +जान +जाने +जाहिर +जुन +जुनै +जे +जो +जोपनि +जोपनी +झैं +ठाउँमा +ठीक +ठूलो +त +तता +तत्काल +तथा +तथापि +तथापी +तदनुसार +तपाइ +तपाई +तपाईको +तब +तर +तर्फ +तल +तसरी +तापनि +तापनी +तिन +तिनि +तिनिहरुलाई +तिनी +तिनीहरु +तिनीहरुको +तिनीहरू +तिनीहरूको +तिनै +तिमी +तिर +तिरको +ती +तीन +तुरन्त +तुरुन्त +तुरुन्तै +तेश्रो +तेस्कारण +तेस्रो +तेह्र +तैपनि +तैपनी +त्यत्तिकै +त्यत्तिकैमा +त्यस +त्यसकारण +त्यसको +त्यसले +त्यसैले +त्यसो +त्यस्तै +त्यस्तो +त्यहाँ +त्यहिँ +त्यही +त्यहीँ +त्यहीं +त्यो +त्सपछि +त्सैले +थप +थरि +थरी +थाहा +थिए +थिएँ +थिएन +थियो +दर्ता +दश +दिए +दिएको +दिन +दिनुभएको +दिनुहुन्छ +दुइ +दुइवटा +दुई +देखि +देखिन्छ +देखियो +देखे +देखेको +देखेर +दोश्री +दोश्रो +दोस्रो +द्वारा +धन्न +धेरै +धौ +न +नगर्नु +नगर्नू +नजिकै +नत्र +नत्रभने +नभई +नभएको +नभनेर +नयाँ +नि +निकै +निम्ति +निम्न +निम्नानुसार +निर्दिष्ट +नै +नौ +पक्का +पक्कै +पछाडि +पछाडी +पछि +पछिल्लो +पछी +पटक +पनि +पन्ध्र +पर्छ +पर्थ्यो +पर्दैन +पर्ने +पर्नेमा +पर्याप्त +पहिले 
+पहिलो +पहिल्यै +पाँच +पांच +पाचौँ +पाँचौं +पिच्छे +पूर्व +पो +प्रति +प्रतेक +प्रत्यक +प्राय +प्लस +फरक +फेरि +फेरी +बढी +बताए +बने +बरु +बाट +बारे +बाहिर +बाहेक +बाह्र +बिच +बिचमा +बिरुद्ध +बिशेष +बिस +बीच +बीचमा +बीस +भए +भएँ +भएका +भएकालाई +भएको +भएन +भएर +भन +भने +भनेको +भनेर +भन् +भन्छन् +भन्छु +भन्दा +भन्दै +भन्नुभयो +भन्ने +भन्या +भयेन +भयो +भर +भरि +भरी +भा +भित्र +भित्री +भीत्र +म +मध्य +मध्ये +मलाई +मा +मात्र +मात्रै +माथि +माथी +मुख्य +मुनि +मुन्तिर +मेरो +मैले +यति +यथोचित +यदि +यद्ध्यपि +यद्यपि +यस +यसका +यसको +यसपछि +यसबाहेक +यसमा +यसरी +यसले +यसो +यस्तै +यस्तो +यहाँ +यहाँसम्म +यही +या +यी +यो +र +रही +रहेका +रहेको +रहेछ +राखे +राख्छ +राम्रो +रुपमा +रूप +रे +लगभग +लगायत +लाई +लाख +लागि +लागेको +ले +वटा +वरीपरी +वा +वाट +वापत +वास्तवमा +शायद +सक्छ +सक्ने +सँग +संग +सँगको +सँगसँगै +सँगै +संगै +सङ्ग +सङ्गको +सट्टा +सत्र +सधै +सबै +सबैको +सबैलाई +समय +समेत +सम्भव +सम्म +सय +सरह +सहित +सहितै +सही +साँच्चै +सात +साथ +साथै +सायद +सारा +सुनेको +सुनेर +सुरु +सुरुको +सुरुमै +सो +सोचेको +सोचेर +सोही +सोह्र +स्थित +स्पष्ट +हजार +हरे +हरेक +हामी +हामीले +हाम्रा +हाम्रो +हुँदैन +हुन +हुनत +हुनु +हुने +हुनेछ +हुन् +हुन्छ +हुन्थ्यो +हैन +हो +होइन +होकि +होला +""".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 1f13da5d6..91b7e4d9d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -170,6 +170,11 @@ def nb_tokenizer(): return get_lang_class("nb").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ne_tokenizer(): + return get_lang_class("ne").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def nl_tokenizer(): return get_lang_class("nl").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/ne/__init__.py b/spacy/tests/lang/ne/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ne/test_text.py b/spacy/tests/lang/ne/test_text.py new file mode 100644 index 000000000..926a7de04 --- /dev/null +++ b/spacy/tests/lang/ne/test_text.py @@ -0,0 +1,19 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_ne_tokenizer_handlers_long_text(ne_tokenizer): + text = """मैले पाएको सर्टिफिकेटलाई म त बोक्रो सम्झन्छु र अभ्यास तब सुरु भयो, जब मैले कलेज पार गरेँ र जीवनको पढाइ सुरु गरेँ ।""" + tokens = ne_tokenizer(text) + assert len(tokens) == 24 + + +@pytest.mark.parametrize( + "text,length", + [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)], +) +def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length): + tokens = ne_tokenizer(text) + assert len(tokens) == length \ No newline at end of file From 150a39ccca2426fcd10638c8515d7ec98cb79d8f Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com> Date: Mon, 22 Jun 2020 21:32:25 +0900 Subject: [PATCH 03/43] Japanese model: add user_dict entries and small refactor (#5573) * user_dict fields: adding inflections, reading_forms, sub_tokens deleting: unidic_tags improve code readability around the token alignment procedure * add test cases, replace fugashi with sudachipy in conftest * move bunsetu.py to spaCy Universe as a pipeline component BunsetuRecognizer * tag is space -> both surface and tag are spaces * consider len(text)==0 --- spacy/lang/ja/__init__.py | 203 +++++++++++++------------- spacy/lang/ja/bunsetu.py | 144 ------------------ spacy/tests/lang/ja/test_tokenizer.py | 53 ++++++- 3 files changed, 152 insertions(+), 248 deletions(-) delete mode 100644 spacy/lang/ja/bunsetu.py diff --git a/spacy/lang/ja/__init__.py 
b/spacy/lang/ja/__init__.py index a7ad0846e..fb8b9d7fe 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -20,12 +20,7 @@ from ... import util # Hold the attributes we need with convenient names -DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"]) - -# Handling for multiple spaces in a row is somewhat awkward, this simplifies -# the flow by creating a dummy with the same interface. -DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"]) -DummySpace = DummyNode(" ", " ", " ") +DetailedToken = namedtuple("DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]) def try_sudachi_import(split_mode="A"): @@ -53,7 +48,7 @@ def try_sudachi_import(split_mode="A"): ) -def resolve_pos(orth, pos, next_pos): +def resolve_pos(orth, tag, next_tag): """If necessary, add a field to the POS tag for UD mapping. Under Universal Dependencies, sometimes the same Unidic POS tag can be mapped differently depending on the literal token or its context @@ -64,124 +59,77 @@ def resolve_pos(orth, pos, next_pos): # Some tokens have their UD tag decided based on the POS of the following # token. - # orth based rules - if pos[0] in TAG_ORTH_MAP: - orth_map = TAG_ORTH_MAP[pos[0]] + # apply orth based mapping + if tag in TAG_ORTH_MAP: + orth_map = TAG_ORTH_MAP[tag] if orth in orth_map: - return orth_map[orth], None + return orth_map[orth], None # current_pos, next_pos - # tag bi-gram mapping - if next_pos: - tag_bigram = pos[0], next_pos[0] + # apply tag bi-gram mapping + if next_tag: + tag_bigram = tag, next_tag if tag_bigram in TAG_BIGRAM_MAP: - bipos = TAG_BIGRAM_MAP[tag_bigram] - if bipos[0] is None: - return TAG_MAP[pos[0]][POS], bipos[1] + current_pos, next_pos = TAG_BIGRAM_MAP[tag_bigram] + if current_pos is None: # apply tag uni-gram mapping for current_pos + return TAG_MAP[tag][POS], next_pos # only next_pos is identified by tag bi-gram mapping else: - return bipos + return current_pos, next_pos - return TAG_MAP[pos[0]][POS], None + # apply tag uni-gram mapping + return TAG_MAP[tag][POS], None -# Use a mapping of paired punctuation to avoid splitting quoted sentences. -pairpunct = {'「':'」', '『': '』', '【': '】'} - - -def separate_sentences(doc): - """Given a doc, mark tokens that start sentences based on Unidic tags. - """ - - stack = [] # save paired punctuation - - for i, token in enumerate(doc[:-2]): - # Set all tokens after the first to false by default. This is necessary - # for the doc code to be aware we've done sentencization, see - # `is_sentenced`. - token.sent_start = (i == 0) - if token.tag_: - if token.tag_ == "補助記号-括弧開": - ts = str(token) - if ts in pairpunct: - stack.append(pairpunct[ts]) - elif stack and ts == stack[-1]: - stack.pop() - - if token.tag_ == "補助記号-句点": - next_token = doc[i+1] - if next_token.tag_ != token.tag_ and not stack: - next_token.sent_start = True - - -def get_dtokens(tokenizer, text): - tokens = tokenizer.tokenize(text) - words = [] - for ti, token in enumerate(tokens): - tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']) - inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*']) - dtoken = DetailedToken( - token.surface(), - (tag, inf), - token.dictionary_form()) - if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白': - # don't add multiple space tokens in a row - continue - words.append(dtoken) - - # remove empty tokens. These can be produced with characters like … that - # Sudachi normalizes internally. 
- words = [ww for ww in words if len(ww.surface) > 0] - return words - - -def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")): +def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): + # Compare the content of tokens and text, first words = [x.surface for x in dtokens] if "".join("".join(words).split()) != "".join(text.split()): raise ValueError(Errors.E194.format(text=text, words=words)) - text_words = [] - text_lemmas = [] - text_tags = [] + + text_dtokens = [] text_spaces = [] text_pos = 0 # handle empty and whitespace-only texts if len(words) == 0: - return text_words, text_lemmas, text_tags, text_spaces + return text_dtokens, text_spaces elif len([word for word in words if not word.isspace()]) == 0: assert text.isspace() - text_words = [text] - text_lemmas = [text] - text_tags = [gap_tag] + text_dtokens = [DetailedToken(text, gap_tag, '', text, None, None)] text_spaces = [False] - return text_words, text_lemmas, text_tags, text_spaces - # normalize words to remove all whitespace tokens - norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()]) - # align words with text - for word, dtoken in zip(norm_words, norm_dtokens): + return text_dtokens, text_spaces + + # align words and dtokens by referring text, and insert gap tokens for the space char spans + for word, dtoken in zip(words, dtokens): + # skip all space tokens + if word.isspace(): + continue try: word_start = text[text_pos:].index(word) except ValueError: raise ValueError(Errors.E194.format(text=text, words=words)) + + # space token if word_start > 0: w = text[text_pos:text_pos + word_start] - text_words.append(w) - text_lemmas.append(w) - text_tags.append(gap_tag) + text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None)) text_spaces.append(False) text_pos += word_start - text_words.append(word) - text_lemmas.append(dtoken.lemma) - text_tags.append(dtoken.pos) + + # content word + text_dtokens.append(dtoken) text_spaces.append(False) text_pos += len(word) + # poll a space char after the word if text_pos < len(text) and text[text_pos] == " ": text_spaces[-1] = True text_pos += 1 + + # trailing space token if text_pos < len(text): w = text[text_pos:] - text_words.append(w) - text_lemmas.append(w) - text_tags.append(gap_tag) + text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None)) text_spaces.append(False) - return text_words, text_lemmas, text_tags, text_spaces + + return text_dtokens, text_spaces class JapaneseTokenizer(DummyTokenizer): @@ -191,29 +139,78 @@ class JapaneseTokenizer(DummyTokenizer): self.tokenizer = try_sudachi_import(self.split_mode) def __call__(self, text): - dtokens = get_dtokens(self.tokenizer, text) + # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces + sudachipy_tokens = self.tokenizer.tokenize(text) + dtokens = self._get_dtokens(sudachipy_tokens) + dtokens, spaces = get_dtokens_and_spaces(dtokens, text) - words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text) + # create Doc with tag bi-gram based part-of-speech identification rules + words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6 + sub_tokens_list = list(sub_tokens_list) doc = Doc(self.vocab, words=words, spaces=spaces) - next_pos = None - for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)): - token.tag_ = unidic_tag[0] - if next_pos: + next_pos = None # for bi-gram rules + for idx, (token, dtoken) in enumerate(zip(doc, 
dtokens)): + token.tag_ = dtoken.tag + if next_pos: # already identified in previous iteration token.pos = next_pos next_pos = None else: token.pos, next_pos = resolve_pos( token.orth_, - unidic_tag, - unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None + dtoken.tag, + tags[idx + 1] if idx + 1 < len(tags) else None ) - # if there's no lemma info (it's an unk) just use the surface - token.lemma_ = lemma - doc.user_data["unidic_tags"] = unidic_tags + token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface + + doc.user_data["inflections"] = inflections + doc.user_data["reading_forms"] = readings + doc.user_data["sub_tokens"] = sub_tokens_list return doc + def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True): + sub_tokens_list = self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None + dtokens = [ + DetailedToken( + token.surface(), # orth + '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']), # tag + ','.join([xx for xx in token.part_of_speech()[4:] if xx != '*']), # inf + token.dictionary_form(), # lemma + token.reading_form(), # user_data['reading_forms'] + sub_tokens_list[idx] if sub_tokens_list else None, # user_data['sub_tokens'] + ) for idx, token in enumerate(sudachipy_tokens) if len(token.surface()) > 0 + # remove empty tokens which can be produced with characters like … that + ] + # Sudachi normalizes internally and outputs each space char as a token. + # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens + return [ + t for idx, t in enumerate(dtokens) if + idx == 0 or + not t.surface.isspace() or t.tag != '空白' or + not dtokens[idx - 1].surface.isspace() or dtokens[idx - 1].tag != '空白' + ] + + def _get_sub_tokens(self, sudachipy_tokens): + if self.split_mode is None or self.split_mode == "A": # do nothing for default split mode + return None + + sub_tokens_list = [] # list of (list of list of DetailedToken | None) + for token in sudachipy_tokens: + sub_a = token.split(self.tokenizer.SplitMode.A) + if len(sub_a) == 1: # no sub tokens + sub_tokens_list.append(None) + elif self.split_mode == "B": + sub_tokens_list.append([self._get_dtokens(sub_a, False)]) + else: # "C" + sub_b = token.split(self.tokenizer.SplitMode.B) + if len(sub_a) == len(sub_b): + dtokens = self._get_dtokens(sub_a, False) + sub_tokens_list.append([dtokens, dtokens]) + else: + sub_tokens_list.append([self._get_dtokens(sub_a, False), self._get_dtokens(sub_b, False)]) + return sub_tokens_list + def _get_config(self): config = OrderedDict( ( diff --git a/spacy/lang/ja/bunsetu.py b/spacy/lang/ja/bunsetu.py deleted file mode 100644 index 7c3eee336..000000000 --- a/spacy/lang/ja/bunsetu.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from .stop_words import STOP_WORDS - - -POS_PHRASE_MAP = { - "NOUN": "NP", - "NUM": "NP", - "PRON": "NP", - "PROPN": "NP", - - "VERB": "VP", - - "ADJ": "ADJP", - - "ADV": "ADVP", - - "CCONJ": "CCONJP", -} - - -# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)] -def yield_bunsetu(doc, debug=False): - bunsetu = [] - bunsetu_may_end = False - phrase_type = None - phrase = None - prev = None - prev_tag = None - prev_dep = None - prev_head = None - for t in doc: - pos = t.pos_ - pos_type = POS_PHRASE_MAP.get(pos, None) - tag = t.tag_ - dep = t.dep_ - head = t.head.i - if debug: - print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu) - - # DET is always an individual bunsetu - if pos == 
"DET": - if bunsetu: - yield bunsetu, phrase_type, phrase - yield [t], None, None - bunsetu = [] - bunsetu_may_end = False - phrase_type = None - phrase = None - - # PRON or Open PUNCT always splits bunsetu - elif tag == "補助記号-括弧開": - if bunsetu: - yield bunsetu, phrase_type, phrase - bunsetu = [t] - bunsetu_may_end = True - phrase_type = None - phrase = None - - # bunsetu head not appeared - elif phrase_type is None: - if bunsetu and prev_tag == "補助記号-読点": - yield bunsetu, phrase_type, phrase - bunsetu = [] - bunsetu_may_end = False - phrase_type = None - phrase = None - bunsetu.append(t) - if pos_type: # begin phrase - phrase = [t] - phrase_type = pos_type - if pos_type in {"ADVP", "CCONJP"}: - bunsetu_may_end = True - - # entering new bunsetu - elif pos_type and ( - pos_type != phrase_type or # different phrase type arises - bunsetu_may_end # same phrase type but bunsetu already ended - ): - # exceptional case: NOUN to VERB - if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i: - bunsetu.append(t) - phrase_type = "VP" - phrase.append(t) - # exceptional case: VERB to NOUN - elif phrase_type == "VP" and pos_type == "NP" and ( - prev_dep == 'compound' and prev_head == t.i or - dep == 'compound' and prev == head or - prev_dep == 'nmod' and prev_head == t.i - ): - bunsetu.append(t) - phrase_type = "NP" - phrase.append(t) - else: - yield bunsetu, phrase_type, phrase - bunsetu = [t] - bunsetu_may_end = False - phrase_type = pos_type - phrase = [t] - - # NOUN bunsetu - elif phrase_type == "NP": - bunsetu.append(t) - if not bunsetu_may_end and (( - (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'} - ) or ( - pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' - )): - phrase.append(t) - else: - bunsetu_may_end = True - - # VERB bunsetu - elif phrase_type == "VP": - bunsetu.append(t) - if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound': - phrase.append(t) - else: - bunsetu_may_end = True - - # ADJ bunsetu - elif phrase_type == "ADJP" and tag != '連体詞': - bunsetu.append(t) - if not bunsetu_may_end and (( - pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'} - ) or ( - pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' - )): - phrase.append(t) - else: - bunsetu_may_end = True - - # other bunsetu - else: - bunsetu.append(t) - - prev = t.i - prev_tag = t.tag_ - prev_dep = t.dep_ - prev_head = head - - if bunsetu: - yield bunsetu, phrase_type, phrase diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 26be5cf59..651e906eb 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS -from spacy.lang.ja import Japanese +from spacy.lang.ja import Japanese, DetailedToken # fmt: off TOKENIZER_TESTS = [ @@ -96,6 +96,57 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): assert len(nlp_c(text)) == len_c +@pytest.mark.parametrize("text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", + [ + ( + "選挙管理委員会", + [None, None, None, None], + [None, None, [ + [ + DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None), + DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', 
sub_tokens=None), + ] + ]], + [[ + [ + DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None), + DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None), + DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None), + DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None), + ], [ + DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None), + DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None), + DetailedToken(surface='委員会', tag='名詞-普通名詞-一般', inf='', lemma='委員会', reading='イインカイ', sub_tokens=None), + ] + ]] + ), + ] +) +def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c): + nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}}) + nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) + nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}}) + + assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a + assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a + assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b + assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c + + +@pytest.mark.parametrize("text,inflections,reading_forms", + [ + ( + "取ってつけた", + ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"), + ("トッ", "テ", "ツケ", "タ"), + ), + ] +) +def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms): + assert ja_tokenizer(text).user_data["inflections"] == inflections + assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms + + def test_ja_tokenizer_emptyish_texts(ja_tokenizer): doc = ja_tokenizer("") assert len(doc) == 0 From bc1cb30b2157b2e3fe63ec42dad9650cb369e3c3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 22 Jun 2020 14:37:24 +0200 Subject: [PATCH 04/43] Add warnings example in v2.3 migration guide (#5627) --- website/docs/usage/v2-3.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md index d59b50a6e..378b1ec34 100644 --- a/website/docs/usage/v2-3.md +++ b/website/docs/usage/v2-3.md @@ -161,10 +161,18 @@ debugging your tokenizer configuration. spaCy's custom warnings have been replaced with native Python [`warnings`](https://docs.python.org/3/library/warnings.html). Instead of -setting `SPACY_WARNING_IGNORE`, use the -[`warnings` filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) +setting `SPACY_WARNING_IGNORE`, use the [`warnings` +filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) to manage warnings. 
+```diff +import spacy ++ import warnings + +- spacy.errors.SPACY_WARNING_IGNORE.append('W007') ++ warnings.filterwarnings("ignore", message=r"\[W007\]", category=UserWarning) +``` + #### Normalization tables The normalization tables have moved from the language data in From 0ef78bad93a5ebaca783693cdf1e948e15cd7a86 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Mon, 22 Jun 2020 23:53:58 -0700 Subject: [PATCH 05/43] contribute (#5632) --- .github/contributors/richardliaw.md | 106 ++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/richardliaw.md diff --git a/.github/contributors/richardliaw.md b/.github/contributors/richardliaw.md new file mode 100644 index 000000000..2af4ce840 --- /dev/null +++ b/.github/contributors/richardliaw.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Richard Liaw | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 06/22/2020 | +| GitHub username | richardliaw | +| Website (optional) | | \ No newline at end of file From d94e961f14af61dba4f01e0e2821217f38b85fbf Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Jun 2020 13:29:51 +0200 Subject: [PATCH 06/43] Fix polarity of Token.is_oov and Lexeme.is_oov (#5634) Fix `Token.is_oov` and `Lexeme.is_oov` so they return `True` when the lexeme does **not** have a vector. 
--- spacy/lexeme.pyx | 2 +- spacy/tests/vocab_vectors/test_vectors.py | 6 +++--- spacy/tokens/token.pyx | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 1df516dcb..8042098d7 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -349,7 +349,7 @@ cdef class Lexeme: @property def is_oov(self): """RETURNS (bool): Whether the lexeme is out-of-vocabulary.""" - return self.orth in self.vocab.vectors + return self.orth not in self.vocab.vectors property is_stop: """RETURNS (bool): Whether the lexeme is a stop word.""" diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 576ca93d2..b31cef1f2 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -376,6 +376,6 @@ def test_vector_is_oov(): data[1] = 2.0 vocab.set_vector("cat", data[0]) vocab.set_vector("dog", data[1]) - assert vocab["cat"].is_oov is True - assert vocab["dog"].is_oov is True - assert vocab["hamster"].is_oov is False + assert vocab["cat"].is_oov is False + assert vocab["dog"].is_oov is False + assert vocab["hamster"].is_oov is True diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 45deebc93..8d3406bae 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -923,7 +923,7 @@ cdef class Token: @property def is_oov(self): """RETURNS (bool): Whether the token is out-of-vocabulary.""" - return self.c.lex.orth in self.vocab.vectors + return self.c.lex.orth not in self.vocab.vectors @property def is_stop(self): From 7ce451c211ef8c528a14ba4c5da3c380e534c350 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Jun 2020 16:48:59 +0200 Subject: [PATCH 07/43] Extend what's new in v2.3 with vocab / is_oov (#5635) --- website/docs/usage/v2-3.md | 45 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md index 378b1ec34..c56b44267 100644 --- a/website/docs/usage/v2-3.md +++ b/website/docs/usage/v2-3.md @@ -182,6 +182,51 @@ If you're adding data for a new language, the normalization table should be added to `spacy-lookups-data`. See [adding norm exceptions](/usage/adding-languages#norm-exceptions). +#### No preloaded lexemes/vocab for models with vectors + +To reduce the initial loading time, the lexemes in `nlp.vocab` are no longer +loaded on initialization for models with vectors. As you process texts, the +lexemes will be added to the vocab automatically, just as in models without +vectors. + +To see the number of unique vectors and number of words with vectors, see +`nlp.meta['vectors']`, for example for `en_core_web_md` there are `20000` +unique vectors and `684830` words with vectors: + +```python +{ + 'width': 300, + 'vectors': 20000, + 'keys': 684830, + 'name': 'en_core_web_md.vectors' +} +``` + +If required, for instance if you are working directly with word vectors rather +than processing texts, you can load all lexemes for words with vectors at once: + +```python +for orth in nlp.vocab.vectors: + _ = nlp.vocab[orth] +``` + +#### Lexeme.is_oov and Token.is_oov + + + +Due to a bug, the values for `is_oov` are reversed in v2.3.0, but this will be +fixed in the next patch release v2.3.1. + + + +In v2.3, `Lexeme.is_oov` and `Token.is_oov` are `True` if the lexeme does not +have a word vector. This is equivalent to `token.orth not in +nlp.vocab.vectors`. + +Previously in v2.2, `is_oov` corresponded to whether a lexeme had stored +probability and cluster features. 
The probability and cluster features are no +longer included in the provided medium and large models (see the next section). + #### Probability and cluster features > #### Load and save extra prob lookups table From 6fe6e761de836550aa71105e1dfd75335612fa82 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Jun 2020 23:21:11 +0200 Subject: [PATCH 08/43] Skip vocab in component config overrides (#5624) --- spacy/util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index 5362952e2..923f56b31 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -208,6 +208,10 @@ def load_model_from_path(model_path, meta=False, **overrides): pipeline = nlp.Defaults.pipe_names elif pipeline in (False, None): pipeline = [] + # skip "vocab" from overrides in component initialization since vocab is + # already configured from overrides when nlp is initialized above + if "vocab" in overrides: + del overrides["vocab"] for name in pipeline: if name not in disable: config = meta.get("pipeline_args", {}).get(name, {}) From fd4287c178feea0ab4a50e70c86f4583c9b886c6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 24 Jun 2020 10:26:12 +0200 Subject: [PATCH 09/43] Fix backslashes in warnings config diff (#5640) Fix backslashes in warnings config diff in v2.3 migration section. --- website/docs/usage/v2-3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md index c56b44267..e6b88c779 100644 --- a/website/docs/usage/v2-3.md +++ b/website/docs/usage/v2-3.md @@ -170,7 +170,7 @@ import spacy + import warnings - spacy.errors.SPACY_WARNING_IGNORE.append('W007') -+ warnings.filterwarnings("ignore", message=r"\[W007\]", category=UserWarning) ++ warnings.filterwarnings("ignore", message=r"\\[W007\\]", category=UserWarning) ``` #### Normalization tables From b7107ac89feee7f1aa1381d3c2978d09919288c2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 26 Jun 2020 09:23:21 +0200 Subject: [PATCH 10/43] Disregard special tag _SP in check for new tag map (#5641) * Skip special tag _SP in check for new tag map In `Tagger.begin_training()` check for new tags aside from `_SP` in the new tag map initialized from the provided gold tuples when determining whether to reinitialize the morphology with the new tag map. 
* Simplify _SP check --- spacy/pipeline/pipes.pyx | 4 ++-- spacy/tests/pipeline/test_tagger.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 3f40cb545..8f07bf8f7 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -528,10 +528,10 @@ class Tagger(Pipe): new_tag_map[tag] = orig_tag_map[tag] else: new_tag_map[tag] = {POS: X} - if "_SP" in orig_tag_map: - new_tag_map["_SP"] = orig_tag_map["_SP"] cdef Vocab vocab = self.vocab if new_tag_map: + if "_SP" in orig_tag_map: + new_tag_map["_SP"] = orig_tag_map["_SP"] vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology.lemmatizer, exc=vocab.morphology.exc) diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index a5bda9090..1681ffeaa 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import pytest from spacy.language import Language +from spacy.symbols import POS, NOUN def test_label_types(): @@ -11,3 +12,16 @@ def test_label_types(): nlp.get_pipe("tagger").add_label("A") with pytest.raises(ValueError): nlp.get_pipe("tagger").add_label(9) + + +def test_tagger_begin_training_tag_map(): + """Test that Tagger.begin_training() without gold tuples does not clobber + the tag map.""" + nlp = Language() + tagger = nlp.create_pipe("tagger") + orig_tag_count = len(tagger.labels) + tagger.add_label("A", {"POS": "NOUN"}) + nlp.add_pipe(tagger) + nlp.begin_training() + assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN} + assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels) From 90c7eb0e2f51eb07582c4d9e3fcaed1fdb51c4bc Mon Sep 17 00:00:00 2001 From: PluieElectrique <41453973+PluieElectrique@users.noreply.github.com> Date: Fri, 26 Jun 2020 12:09:10 +0000 Subject: [PATCH 11/43] Reduce memory usage of Lookup's BloomFilter (#5606) * Reduce memory usage of Lookup's BloomFilter * Remove extra Table update --- .github/contributors/PluieElectrique.md | 106 ++++++++++++++++++++++++ spacy/lookups.py | 5 +- 2 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 .github/contributors/PluieElectrique.md diff --git a/.github/contributors/PluieElectrique.md b/.github/contributors/PluieElectrique.md new file mode 100644 index 000000000..97e01650a --- /dev/null +++ b/.github/contributors/PluieElectrique.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Pluie | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-06-18 | +| GitHub username | PluieElectrique | +| Website (optional) | | diff --git a/spacy/lookups.py b/spacy/lookups.py index 1fa29bdfe..d4947be9f 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -120,8 +120,7 @@ class Lookups(object): """ self._tables = OrderedDict() for key, value in srsly.msgpack_loads(bytes_data).items(): - self._tables[key] = Table(key) - self._tables[key].update(value) + self._tables[key] = Table(key, value) return self def to_disk(self, path, filename="lookups.bin", **kwargs): @@ -192,7 +191,7 @@ class Table(OrderedDict): self.name = name # Assume a default size of 1M items self.default_size = 1e6 - size = len(data) if data and len(data) > 0 else self.default_size + size = max(len(data), 1) if data is not None else self.default_size self.bloom = BloomFilter.from_error_rate(size) if data: self.update(data) From c4d02094726a7e92325f9fc0911fcfad7f43db75 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 26 Jun 2020 14:12:29 +0200 Subject: [PATCH 12/43] Extend v2.3 migration guide (#5653) * Extend preloaded vocab section * Add section on tag maps --- website/docs/usage/v2-3.md | 78 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md index e6b88c779..b6c4d7dfb 100644 --- a/website/docs/usage/v2-3.md +++ b/website/docs/usage/v2-3.md @@ -182,12 +182,12 @@ If you're adding data for a new language, the normalization table should be added to `spacy-lookups-data`. See [adding norm exceptions](/usage/adding-languages#norm-exceptions). -#### No preloaded lexemes/vocab for models with vectors +#### No preloaded vocab for models with vectors To reduce the initial loading time, the lexemes in `nlp.vocab` are no longer loaded on initialization for models with vectors. As you process texts, the -lexemes will be added to the vocab automatically, just as in models without -vectors. +lexemes will be added to the vocab automatically, just as in small models +without vectors. To see the number of unique vectors and number of words with vectors, see `nlp.meta['vectors']`, for example for `en_core_web_md` there are `20000` @@ -210,6 +210,20 @@ for orth in nlp.vocab.vectors: _ = nlp.vocab[orth] ``` +If your workflow previously iterated over `nlp.vocab`, a similar alternative +is to iterate over words with vectors instead: + +```diff +- lexemes = [w for w in nlp.vocab] ++ lexemes = [nlp.vocab[orth] for orth in nlp.vocab.vectors] +``` + +Be aware that the set of preloaded lexemes in a v2.2 model is not equivalent to +the set of words with vectors. For English, v2.2 `md/lg` models have 1.3M +provided lexemes but only 685K words with vectors. The vectors have been +updated for most languages in v2.2, but the English models contain the same +vectors for both v2.2 and v2.3. + #### Lexeme.is_oov and Token.is_oov @@ -254,6 +268,28 @@ model vocab, which will take a few seconds on initial loading. 
When you save this model after loading the `prob` table, the full `prob` table will be saved as part of the model vocab. +To load the probability table into a provided model, first make sure you have +`spacy-lookups-data` installed. To load the table, remove the empty provided +`lexeme_prob` table and then access `Lexeme.prob` for any word to load the +table from `spacy-lookups-data`: + +```diff ++ # prerequisite: pip install spacy-lookups-data +import spacy + +nlp = spacy.load("en_core_web_md") + +# remove the empty placeholder prob table ++ if nlp.vocab.lookups_extra.has_table("lexeme_prob"): ++ nlp.vocab.lookups_extra.remove_table("lexeme_prob") + +# access any `.prob` to load the full table into the model +assert nlp.vocab["a"].prob == -3.9297883511 + +# if desired, save this model with the probability table included +nlp.to_disk("/path/to/model") +``` + If you'd like to include custom `cluster`, `prob`, or `sentiment` tables as part of a new model, add the data to [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) under @@ -271,3 +307,39 @@ When you initialize a new model with [`spacy init-model`](/api/cli#init-model), the `prob` table from `spacy-lookups-data` may be loaded as part of the initialization. If you'd like to omit this extra data as in spaCy's provided v2.3 models, use the new flag `--omit-extra-lookups`. + +#### Tag maps in provided models vs. blank models + +The tag maps in the provided models may differ from the tag maps in the spaCy +library. You can access the tag map in a loaded model under +`nlp.vocab.morphology.tag_map`. + +The tag map from `spacy.lang.lg.tag_map` is still used when a blank model is +initialized. If you want to provide an alternate tag map, update +`nlp.vocab.morphology.tag_map` after initializing the model or if you're using +the [train CLI](/api/cli#train), you can use the new `--tag-map-path` option to +provide in the tag map as a JSON dict. + +If you want to export a tag map from a provided model for use with the train +CLI, you can save it as a JSON dict. To only use string keys as required by +JSON and to make it easier to read and edit, any internal integer IDs need to +be converted back to strings: + +```python +import spacy +import srsly + +nlp = spacy.load("en_core_web_sm") +tag_map = {} + +# convert any integer IDs to strings for JSON +for tag, morph in nlp.vocab.morphology.tag_map.items(): + tag_map[tag] = {} + for feat, val in morph.items(): + feat = nlp.vocab.strings.as_string(feat) + if not isinstance(val, bool): + val = nlp.vocab.strings.as_string(val) + tag_map[tag][feat] = val + +srsly.write_json("tag_map.json", tag_map) +``` From 167df42cb6bdda3edf05cbef44f0edb9b73f05f1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 29 Jun 2020 14:16:57 +0200 Subject: [PATCH 13/43] Move lemmatizer is_base_form to language settings (#5663) Move `Lemmatizer.is_base_form` to the language settings so that each language can provide a language-specific method as `LanguageDefaults.is_base_form`. The existing English-specific `Lemmatizer.is_base_form` is moved to `EnglishDefaults`. 
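A minimal usage sketch of the new hook (illustrative only — `my_is_base_form` and the toy lookup tables below are invented for this example; the real English rules now live in `EnglishDefaults.is_base_form`, as the diff below shows):

```python
# Sketch: plug a custom base-form check into the Lemmatizer via the new
# `is_base_form` keyword argument. The rule and the tables are toy examples,
# not real lemmatization data.
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups


def my_is_base_form(univ_pos, morphology=None):
    # Toy rule: treat singular nouns as already being in their base form
    morphology = morphology or {}
    return univ_pos == "noun" and morphology.get("Number") == "sing"


lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
lookups.add_table("lemma_index", {"noun": {}})
lookups.add_table("lemma_exc", {"noun": {}})

lemmatizer = Lemmatizer(lookups, is_base_form=my_is_base_form)
# the custom check short-circuits lemmatization for singular nouns ...
assert lemmatizer("dog", "noun", {"Number": "sing"}) == ["dog"]
# ... while the suffix rules still apply otherwise
assert lemmatizer("dogs", "noun", {"Number": "plur"}) == ["dog"]
```

Passing `is_base_form=None` (the default) simply skips the check, which is what the new Norwegian regression test further down exercises.
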
--- spacy/lang/en/__init__.py | 36 ++++++++++++++++++++ spacy/language.py | 3 +- spacy/lemmatizer.py | 39 ++-------------------- spacy/tests/regression/test_issue1-1000.py | 3 +- spacy/tests/test_lemmatizer.py | 12 +++++++ 5 files changed, 55 insertions(+), 38 deletions(-) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 4304b3c6a..d52f3dfd8 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -18,6 +18,41 @@ def _return_en(_): return "en" +def en_is_base_form(univ_pos, morphology=None): + """ + Check whether we're dealing with an uninflected paradigm, so we can + avoid lemmatization entirely. + + univ_pos (unicode / int): The token's universal part-of-speech tag. + morphology (dict): The token's morphological features following the + Universal Dependencies scheme. + """ + if morphology is None: + morphology = {} + if univ_pos == "noun" and morphology.get("Number") == "sing": + return True + elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": + return True + # This maps 'VBP' to base form -- probably just need 'IS_BASE' + # morphology + elif univ_pos == "verb" and ( + morphology.get("VerbForm") == "fin" + and morphology.get("Tense") == "pres" + and morphology.get("Number") is None + ): + return True + elif univ_pos == "adj" and morphology.get("Degree") == "pos": + return True + elif morphology.get("VerbForm") == "inf": + return True + elif morphology.get("VerbForm") == "none": + return True + elif morphology.get("Degree") == "pos": + return True + else: + return False + + class EnglishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) @@ -26,6 +61,7 @@ class EnglishDefaults(Language.Defaults): tag_map = TAG_MAP stop_words = STOP_WORDS morph_rules = MORPH_RULES + is_base_form = en_is_base_form syntax_iterators = SYNTAX_ITERATORS single_orth_variants = [ {"tags": ["NFP"], "variants": ["…", "..."]}, diff --git a/spacy/language.py b/spacy/language.py index 2058def8a..faa0447a4 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -46,7 +46,7 @@ class BaseDefaults(object): def create_lemmatizer(cls, nlp=None, lookups=None): if lookups is None: lookups = cls.create_lookups(nlp=nlp) - return Lemmatizer(lookups=lookups) + return Lemmatizer(lookups=lookups, is_base_form=cls.is_base_form) @classmethod def create_lookups(cls, nlp=None): @@ -120,6 +120,7 @@ class BaseDefaults(object): tokenizer_exceptions = {} stop_words = set() morph_rules = {} + is_base_form = None lex_attr_getters = LEX_ATTRS syntax_iterators = {} resources = {} diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 1f0f0da3f..f72eae128 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -21,7 +21,7 @@ class Lemmatizer(object): def load(cls, *args, **kwargs): raise NotImplementedError(Errors.E172) - def __init__(self, lookups, *args, **kwargs): + def __init__(self, lookups, *args, is_base_form=None, **kwargs): """Initialize a Lemmatizer. lookups (Lookups): The lookups object containing the (optional) tables @@ -31,6 +31,7 @@ class Lemmatizer(object): if args or kwargs or not isinstance(lookups, Lookups): raise ValueError(Errors.E173) self.lookups = lookups + self.is_base_form = is_base_form def __call__(self, string, univ_pos, morphology=None): """Lemmatize a string. @@ -51,7 +52,7 @@ class Lemmatizer(object): if univ_pos in ("", "eol", "space"): return [string.lower()] # See Issue #435 for example of where this logic is requied. 
- if self.is_base_form(univ_pos, morphology): + if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology): return [string.lower()] index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) @@ -69,40 +70,6 @@ class Lemmatizer(object): ) return lemmas - def is_base_form(self, univ_pos, morphology=None): - """ - Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely. - - univ_pos (unicode / int): The token's universal part-of-speech tag. - morphology (dict): The token's morphological features following the - Universal Dependencies scheme. - """ - if morphology is None: - morphology = {} - if univ_pos == "noun" and morphology.get("Number") == "sing": - return True - elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": - return True - # This maps 'VBP' to base form -- probably just need 'IS_BASE' - # morphology - elif univ_pos == "verb" and ( - morphology.get("VerbForm") == "fin" - and morphology.get("Tense") == "pres" - and morphology.get("Number") is None - ): - return True - elif univ_pos == "adj" and morphology.get("Degree") == "pos": - return True - elif morphology.get("VerbForm") == "inf": - return True - elif morphology.get("VerbForm") == "none": - return True - elif morphology.get("Degree") == "pos": - return True - else: - return False - def noun(self, string, morphology=None): return self(string, "noun", morphology) diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 6d88d68c2..38a99371e 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -11,6 +11,7 @@ from spacy.language import Language from spacy.lemmatizer import Lemmatizer from spacy.lookups import Lookups from spacy.tokens import Doc, Span +from spacy.lang.en import EnglishDefaults from ..util import get_doc, make_tempdir @@ -172,7 +173,7 @@ def test_issue595(): lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]}) lookups.add_table("lemma_index", {"verb": {}}) lookups.add_table("lemma_exc", {"verb": {}}) - lemmatizer = Lemmatizer(lookups) + lemmatizer = Lemmatizer(lookups, is_base_form=EnglishDefaults.is_base_form) vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) doc = Doc(vocab, words=words) doc[2].tag_ = "VB" diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index fce3772c4..e7736b042 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -5,6 +5,7 @@ import pytest from spacy.tokens import Doc from spacy.language import Language from spacy.lookups import Lookups +from spacy.lemmatizer import Lemmatizer def test_lemmatizer_reflects_lookups_changes(): @@ -47,3 +48,14 @@ def test_tagger_warns_no_lookups(): with pytest.warns(None) as record: nlp.begin_training() assert not record.list + + +def test_lemmatizer_without_is_base_form_implementation(): + # Norwegian example from #5658 + lookups = Lookups() + lookups.add_table("lemma_rules", {"noun": []}) + lookups.add_table("lemma_index", {"noun": {}}) + lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}}) + + lemmatizer = Lemmatizer(lookups, is_base_form=None) + assert lemmatizer("Formuesskatten", "noun", {'Definite': 'def', 'Gender': 'masc', 'Number': 'sing'}) == ["formuesskatt"] From 1dd38191ecf684caa967e54e70452f5150551de5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 29 Jun 2020 14:20:26 +0200 Subject: [PATCH 14/43] Convert custom user_data to token 
extension format for Japanese tokenizer (#5652) * Convert custom user_data to token extension format Convert the user_data values so that they can be loaded as custom token extensions for `inflection`, `reading_form`, `sub_tokens`, and `lemma`. * Reset Underscore state in ja tokenizer tests --- spacy/lang/ja/__init__.py | 13 +++++----- spacy/tests/lang/ja/test_tokenizer.py | 37 +++++++++++++++++++++------ 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index fb8b9d7fe..f356f3d64 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -145,8 +145,7 @@ class JapaneseTokenizer(DummyTokenizer): dtokens, spaces = get_dtokens_and_spaces(dtokens, text) # create Doc with tag bi-gram based part-of-speech identification rules - words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6 - sub_tokens_list = list(sub_tokens_list) + words = [dtoken.surface for dtoken in dtokens] doc = Doc(self.vocab, words=words, spaces=spaces) next_pos = None # for bi-gram rules for idx, (token, dtoken) in enumerate(zip(doc, dtokens)): @@ -158,14 +157,14 @@ class JapaneseTokenizer(DummyTokenizer): token.pos, next_pos = resolve_pos( token.orth_, dtoken.tag, - tags[idx + 1] if idx + 1 < len(tags) else None + dtokens[idx + 1].tag if idx + 1 < len(dtokens) else None ) # if there's no lemma info (it's an unk) just use the surface token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface - - doc.user_data["inflections"] = inflections - doc.user_data["reading_forms"] = readings - doc.user_data["sub_tokens"] = sub_tokens_list + doc.user_data[('._.', 'inflection', token.idx, None)] = dtoken.inf + doc.user_data[('._.', 'reading_form', token.idx, None)] = dtoken.reading + doc.user_data[('._.', 'sub_tokens', token.idx, None)] = dtoken.sub_tokens + doc.user_data[('._.', 'lemma', token.idx, None)] = token.lemma_ return doc diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 651e906eb..fad5e1390 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -5,6 +5,18 @@ import pytest from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS from spacy.lang.ja import Japanese, DetailedToken +from spacy.tokens import Token +from spacy.tokens.underscore import Underscore + + +@pytest.fixture(scope="function", autouse=True) +def clean_underscore(): + # reset the Underscore object after the test, to avoid having state copied across tests + yield + Underscore.doc_extensions = {} + Underscore.span_extensions = {} + Underscore.token_extensions = {} + # fmt: off TOKENIZER_TESTS = [ @@ -127,24 +139,33 @@ def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_toke nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}}) - assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a - assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a - assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b - assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c + doc = ja_tokenizer(text) + doc_a = nlp_a(text) + doc_b = nlp_b(text) + doc_c = nlp_c(text) + + Token.set_extension("sub_tokens", default="") + assert [t._.sub_tokens for t in doc] == sub_tokens_list_a + assert [t._.sub_tokens for t in doc_a] == sub_tokens_list_a + assert [t._.sub_tokens for t in doc_b] == sub_tokens_list_b + assert 
[t._.sub_tokens for t in doc_c] == sub_tokens_list_c @pytest.mark.parametrize("text,inflections,reading_forms", [ ( "取ってつけた", - ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"), - ("トッ", "テ", "ツケ", "タ"), + ["五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"], + ["トッ", "テ", "ツケ", "タ"], ), ] ) def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms): - assert ja_tokenizer(text).user_data["inflections"] == inflections - assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms + Token.set_extension("inflection", default="") + Token.set_extension("reading_form", default="") + doc = ja_tokenizer(text) + assert [t._.inflection for t in doc] == inflections + assert [t._.reading_form for t in doc] == reading_forms def test_ja_tokenizer_emptyish_texts(ja_tokenizer): From 2d715451a2215bf589da5e7d2c7a0234d05cbbc8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2020 14:34:15 +0200 Subject: [PATCH 15/43] Revert "Convert custom user_data to token extension format for Japanese tokenizer (#5652)" (#5665) This reverts commit 1dd38191ecf684caa967e54e70452f5150551de5. --- spacy/lang/ja/__init__.py | 13 +++++----- spacy/tests/lang/ja/test_tokenizer.py | 37 ++++++--------------------- 2 files changed, 15 insertions(+), 35 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index f356f3d64..fb8b9d7fe 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -145,7 +145,8 @@ class JapaneseTokenizer(DummyTokenizer): dtokens, spaces = get_dtokens_and_spaces(dtokens, text) # create Doc with tag bi-gram based part-of-speech identification rules - words = [dtoken.surface for dtoken in dtokens] + words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6 + sub_tokens_list = list(sub_tokens_list) doc = Doc(self.vocab, words=words, spaces=spaces) next_pos = None # for bi-gram rules for idx, (token, dtoken) in enumerate(zip(doc, dtokens)): @@ -157,14 +158,14 @@ class JapaneseTokenizer(DummyTokenizer): token.pos, next_pos = resolve_pos( token.orth_, dtoken.tag, - dtokens[idx + 1].tag if idx + 1 < len(dtokens) else None + tags[idx + 1] if idx + 1 < len(tags) else None ) # if there's no lemma info (it's an unk) just use the surface token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface - doc.user_data[('._.', 'inflection', token.idx, None)] = dtoken.inf - doc.user_data[('._.', 'reading_form', token.idx, None)] = dtoken.reading - doc.user_data[('._.', 'sub_tokens', token.idx, None)] = dtoken.sub_tokens - doc.user_data[('._.', 'lemma', token.idx, None)] = token.lemma_ + + doc.user_data["inflections"] = inflections + doc.user_data["reading_forms"] = readings + doc.user_data["sub_tokens"] = sub_tokens_list return doc diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index fad5e1390..651e906eb 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -5,18 +5,6 @@ import pytest from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS from spacy.lang.ja import Japanese, DetailedToken -from spacy.tokens import Token -from spacy.tokens.underscore import Underscore - - -@pytest.fixture(scope="function", autouse=True) -def clean_underscore(): - # reset the Underscore object after the test, to avoid having state copied across tests - yield - Underscore.doc_extensions = {} - Underscore.span_extensions = {} - Underscore.token_extensions = {} - # fmt: off TOKENIZER_TESTS = [ @@ 
-139,33 +127,24 @@ def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_toke nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}}) - doc = ja_tokenizer(text) - doc_a = nlp_a(text) - doc_b = nlp_b(text) - doc_c = nlp_c(text) - - Token.set_extension("sub_tokens", default="") - assert [t._.sub_tokens for t in doc] == sub_tokens_list_a - assert [t._.sub_tokens for t in doc_a] == sub_tokens_list_a - assert [t._.sub_tokens for t in doc_b] == sub_tokens_list_b - assert [t._.sub_tokens for t in doc_c] == sub_tokens_list_c + assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a + assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a + assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b + assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c @pytest.mark.parametrize("text,inflections,reading_forms", [ ( "取ってつけた", - ["五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"], - ["トッ", "テ", "ツケ", "タ"], + ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"), + ("トッ", "テ", "ツケ", "タ"), ), ] ) def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms): - Token.set_extension("inflection", default="") - Token.set_extension("reading_form", default="") - doc = ja_tokenizer(text) - assert [t._.inflection for t in doc] == inflections - assert [t._.reading_form for t in doc] == reading_forms + assert ja_tokenizer(text).user_data["inflections"] == inflections + assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms def test_ja_tokenizer_emptyish_texts(ja_tokenizer): From 8b0f7496062cc0570c334778a8d21b1a3408e478 Mon Sep 17 00:00:00 2001 From: Matthias Hertel Date: Tue, 30 Jun 2020 19:58:23 +0200 Subject: [PATCH 16/43] Website: fixed the token span in the text about the rule-based matching example (#5669) * fixed token span in pattern matcher example * contributor agreement --- .github/contributors/hertelm.md | 106 ++++++++++++++++++++++ website/docs/usage/rule-based-matching.md | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/hertelm.md diff --git a/.github/contributors/hertelm.md b/.github/contributors/hertelm.md new file mode 100644 index 000000000..ba4250bfc --- /dev/null +++ b/.github/contributors/hertelm.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Matthias Hertel | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | June 29, 2020 | +| GitHub username | hertelm | +| Website (optional) | | diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index f7866fe31..252aa8c77 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -122,7 +122,7 @@ for match_id, start, end in matches: ``` The matcher returns a list of `(match_id, start, end)` tuples – in this case, -`[('15578876784678163569', 0, 2)]`, which maps to the span `doc[0:2]` of our +`[('15578876784678163569', 0, 3)]`, which maps to the span `doc[0:3]` of our original document. The `match_id` is the [hash value](/usage/spacy-101#vocab) of the string ID "HelloWorld". To get the string value, you can look up the ID in the [`StringStore`](/api/stringstore). From ff0dbe5c6413b62a40f3268987844f0ce2a34a16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Abella=20Bascar=C3=A1n?= Date: Tue, 30 Jun 2020 20:00:50 +0200 Subject: [PATCH 17/43] Fix in docs: pipe(docs) instead of pipe(texts) (#5680) Very minor fix in docs, specifically in this part: ``` matcher = PhraseMatcher(nlp.vocab) > for doc in matcher.pipe(texts, batch_size=50): > pass ``` `texts` suggests the input is an iterable of strings. I replaced it for `docs`. --- website/docs/api/phrasematcher.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index a72277420..49211174c 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -91,7 +91,7 @@ Match a stream of documents, yielding them in turn. 
> ```python > from spacy.matcher import PhraseMatcher > matcher = PhraseMatcher(nlp.vocab) -> for doc in matcher.pipe(texts, batch_size=50): +> for doc in matcher.pipe(docs, batch_size=50): > pass > ``` From f2a932a60c09766cda0c4b5c534bf94a7ad09add Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 1 Jul 2020 13:34:35 +0200 Subject: [PATCH 18/43] Update netlify.toml [ci skip] --- netlify.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/netlify.toml b/netlify.toml index be809f1d4..452b5979a 100644 --- a/netlify.toml +++ b/netlify.toml @@ -1,6 +1,8 @@ redirects = [ # Netlify {from = "https://spacy.netlify.com/*", to="https://spacy.io/:splat", force = true }, + # Subdomain for branches + {from = "https://nightly.spacy.io/*", to="https://spacy-io-develop.spacy.io/:splat", force = true, status = 200}, # Old subdomains {from = "https://survey.spacy.io/*", to = "https://spacy.io", force = true}, {from = "http://survey.spacy.io/*", to = "https://spacy.io", force = true}, From 6bc643d2e2a0812a4490f9ecc66bb480529a3b8f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 1 Jul 2020 21:34:17 +0200 Subject: [PATCH 19/43] Update netlify.toml [ci skip] --- netlify.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/netlify.toml b/netlify.toml index 452b5979a..9cb11ae81 100644 --- a/netlify.toml +++ b/netlify.toml @@ -2,7 +2,7 @@ redirects = [ # Netlify {from = "https://spacy.netlify.com/*", to="https://spacy.io/:splat", force = true }, # Subdomain for branches - {from = "https://nightly.spacy.io/*", to="https://spacy-io-develop.spacy.io/:splat", force = true, status = 200}, + {from = "https://nightly.spacy.io/*", to="https://nightly-spacy-io.spacy.io/:splat", force = true, status = 200}, # Old subdomains {from = "https://survey.spacy.io/*", to = "https://spacy.io", force = true}, {from = "http://survey.spacy.io/*", to = "https://spacy.io", force = true}, From 2bd78c39e33b90f788b1121b93b3b098c4c4af10 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 2 Jul 2020 10:36:07 +0200 Subject: [PATCH 20/43] Fix multiple context manages in examples (#5690) --- examples/training/rehearsal.py | 2 +- examples/training/train_ner.py | 2 +- examples/training/train_new_entity_type.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py index 24b1cea00..1cdac02aa 100644 --- a/examples/training/rehearsal.py +++ b/examples/training/rehearsal.py @@ -67,7 +67,7 @@ def main(model_name, unlabelled_loc): pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] sizes = compounding(1.0, 4.0, 1.001) - with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings(): + with nlp.disable_pipes(*other_pipes), warnings.catch_warnings(): # show warnings for misaligned entity spans once warnings.filterwarnings("once", category=UserWarning, module='spacy') diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index ff6029567..f64ba801a 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -59,7 +59,7 @@ def main(model=None, output_dir=None, n_iter=100): pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] # only train NER - with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings(): + with nlp.disable_pipes(*other_pipes), warnings.catch_warnings(): # show warnings for misaligned entity spans once 
warnings.filterwarnings("once", category=UserWarning, module='spacy') diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index e8ff6802a..a14688012 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -99,7 +99,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] # only train NER - with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings(): + with nlp.disable_pipes(*other_pipes), warnings.catch_warnings(): # show warnings for misaligned entity spans once warnings.filterwarnings("once", category=UserWarning, module='spacy') From 971826a96da9d114a86cbfb8b4bb9ab026abe8e6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 2 Jul 2020 17:10:27 +0200 Subject: [PATCH 21/43] Include git commit in package and model meta (#5694) * Include git commit in package and model meta * Rewrite to read file in setup * Fix file handle --- .gitignore | 1 + MANIFEST.in | 1 + setup.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++ spacy/language.py | 2 ++ 4 files changed, 55 insertions(+) diff --git a/.gitignore b/.gitignore index edcbba4d5..eb6be73dd 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,7 @@ Pipfile.lock *.egg .eggs MANIFEST +spacy/git_info.py # Temporary files *.~* diff --git a/MANIFEST.in b/MANIFEST.in index 1947b9140..9819c7b70 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,3 +6,4 @@ include bin/spacy include pyproject.toml recursive-exclude spacy/lang *.json recursive-include spacy/lang *.json.gz +recursive-include licenses * diff --git a/setup.py b/setup.py index 62a09aa73..01e372e91 100755 --- a/setup.py +++ b/setup.py @@ -118,6 +118,55 @@ def is_source_release(path): return os.path.exists(os.path.join(path, "PKG-INFO")) +# Include the git version in the build (adapted from NumPy) +# Copyright (c) 2005-2020, NumPy Developers. 
+# BSD 3-Clause license, see licenses/3rd_party_licenses.txt +def write_git_info_py(filename="spacy/git_info.py"): + def _minimal_ext_cmd(cmd): + # construct minimal environment + env = {} + for k in ["SYSTEMROOT", "PATH", "HOME"]: + v = os.environ.get(k) + if v is not None: + env[k] = v + # LANGUAGE is used on win32 + env["LANGUAGE"] = "C" + env["LANG"] = "C" + env["LC_ALL"] = "C" + out = subprocess.check_output(cmd, stderr=subprocess.STDOUT, env=env) + return out + + git_version = "Unknown" + if os.path.exists(".git"): + try: + out = _minimal_ext_cmd(["git", "rev-parse", "--short", "HEAD"]) + git_version = out.strip().decode("ascii") + except: + pass + elif os.path.exists(filename): + # must be a source distribution, use existing version file + try: + a = open(filename, "r") + lines = a.readlines() + git_version = lines[-1].split('"')[1] + except: + pass + finally: + a.close() + + text = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY +# +GIT_VERSION = "%(git_version)s" +""" + a = open(filename, "w") + try: + a.write( + text % {"git_version": git_version,} + ) + finally: + a.close() + + def clean(path): for name in MOD_NAMES: name = name.replace(".", "/") @@ -140,6 +189,8 @@ def chdir(new_dir): def setup_package(): + write_git_info_py() + root = os.path.abspath(os.path.dirname(__file__)) if len(sys.argv) > 1 and sys.argv[1] == "clean": diff --git a/spacy/language.py b/spacy/language.py index faa0447a4..e9d195453 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -34,6 +34,7 @@ from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop from .errors import Errors, Warnings +from .git_info import GIT_VERSION from . import util from . import about @@ -206,6 +207,7 @@ class Language(object): self._meta.setdefault("email", "") self._meta.setdefault("url", "") self._meta.setdefault("license", "") + self._meta.setdefault("spacy_git_version", GIT_VERSION) self._meta["vectors"] = { "width": self.vocab.vectors_length, "vectors": len(self.vocab.vectors), From a77c4c3465d12f70fc2436b6d3def414082d77a9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 2 Jul 2020 17:11:57 +0200 Subject: [PATCH 22/43] Add strings and ENT_KB_ID to Doc serialization (#5691) * Add strings for all writeable Token attributes to `Doc.to/from_bytes()`. * Add ENT_KB_ID to default attributes. 
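A rough round-trip sketch of what the extra serialized strings enable (illustrative only — the blank English pipeline, the example text, and the `"Q64"` knowledge-base ID are placeholders, and it assumes a build that includes this change): values written to writeable token attributes should now resolve even when the bytes are loaded into a `Doc` backed by a different, initially empty `Vocab`, because the strings travel with the serialized `Doc`.

```python
# Sketch: custom token strings survive to_bytes()/from_bytes() across vocabs.
# "Q64" is a placeholder KB ID, not real model output.
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab

nlp = spacy.blank("en")
doc = nlp("Berlin is nice")
doc[0].lemma_ = "Berlin"
doc[0].ent_kb_id_ = "Q64"

doc_bytes = doc.to_bytes()
new_doc = Doc(Vocab()).from_bytes(doc_bytes)  # fresh vocab, no shared strings
assert new_doc[0].lemma_ == "Berlin"
assert new_doc[0].ent_kb_id_ == "Q64"
```
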
--- spacy/tests/doc/test_doc_api.py | 6 ++++++ spacy/tokens/doc.pyx | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 6801d7844..388cd78fe 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -106,10 +106,16 @@ def test_doc_api_getitem(en_tokenizer): ) def test_doc_api_serialize(en_tokenizer, text): tokens = en_tokenizer(text) + tokens[0].lemma_ = "lemma" + tokens[0].norm_ = "norm" + tokens[0].ent_kb_id_ = "ent_kb_id" new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes()) assert tokens.text == new_tokens.text assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] + assert new_tokens[0].lemma_ == "lemma" + assert new_tokens[0].norm_ == "norm" + assert new_tokens[0].ent_kb_id_ == "ent_kb_id" new_tokens = Doc(tokens.vocab).from_bytes( tokens.to_bytes(exclude=["tensor"]), exclude=["tensor"] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 25a147208..5b03dc5d2 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -892,7 +892,7 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#to_bytes """ - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM] # TODO: ENT_KB_ID ? + array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID] if self.is_tagged: array_head.extend([TAG, POS]) # If doc parsed add head and dep attribute @@ -901,6 +901,14 @@ cdef class Doc: # Otherwise add sent_start else: array_head.append(SENT_START) + strings = set() + for token in self: + strings.add(token.tag_) + strings.add(token.lemma_) + strings.add(token.dep_) + strings.add(token.ent_type_) + strings.add(token.ent_kb_id_) + strings.add(token.norm_) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. As a best guess, we *know* that within # keys, we must have tuples. 
In values we just have to hope @@ -912,6 +920,7 @@ cdef class Doc: "sentiment": lambda: self.sentiment, "tensor": lambda: self.tensor, "cats": lambda: self.cats, + "strings": lambda: list(strings), } for key in kwargs: if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"): @@ -942,6 +951,7 @@ cdef class Doc: "sentiment": lambda b: None, "tensor": lambda b: None, "cats": lambda b: None, + "strings": lambda b: None, "user_data_keys": lambda b: None, "user_data_values": lambda b: None, } @@ -965,6 +975,9 @@ cdef class Doc: self.tensor = msg["tensor"] if "cats" not in exclude and "cats" in msg: self.cats = msg["cats"] + if "strings" not in exclude and "strings" in msg: + for s in msg["strings"]: + self.vocab.strings.add(s) start = 0 cdef const LexemeC* lex cdef unicode orth_ From 2fb9bd795da1670e7ed7f3134652cf31aac10a96 Mon Sep 17 00:00:00 2001 From: Matthias Hertel Date: Fri, 3 Jul 2020 10:24:02 +0200 Subject: [PATCH 23/43] Fixed vocabulary in the entity linker training example (#5676) * entity linker training example: model loading changed according to issue 5668 (https://github.com/explosion/spaCy/issues/5668) + vocab_path is a required argument * contributor agreement --- examples/training/train_entity_linker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index 3a8deb7a0..a68007504 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -60,12 +60,12 @@ TRAIN_DATA = sample_train_data() output_dir=("Optional output directory", "option", "o", Path), n_iter=("Number of training iterations", "option", "n", int), ) -def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): +def main(kb_path, vocab_path, output_dir=None, n_iter=50): """Create a blank model with the specified vocab, set up the pipeline and train the entity linker. 
The `vocab` should be the one used during creation of the KB.""" - vocab = Vocab().from_disk(vocab_path) # create blank English model with correct vocab - nlp = spacy.blank("en", vocab=vocab) + nlp = spacy.blank("en") + nlp.vocab.from_disk(vocab_path) nlp.vocab.vectors.name = "spacy_pretrained_vectors" print("Created blank 'en' model with vocab from '%s'" % vocab_path) From 86d13a9fb84cee2df56a998446d340207dfdbd5f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 3 Jul 2020 13:38:41 +0200 Subject: [PATCH 24/43] Set version to 2.3.1 (#5705) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 91810fa68..cd97fa987 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.3.0" +__version__ = "2.3.1" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 3e78e82a834b42ff165fbb2614cb6f42206ce390 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 5 Jul 2020 15:48:39 +0200 Subject: [PATCH 25/43] Experimental character-based pretraining (#5700) * Use cosine loss in Cloze multitask * Fix char_embed for gpu * Call resume_training for base model in train CLI * Fix bilstm_depth default in pretrain command * Implement character-based pretraining objective * Use chars loss in ClozeMultitask * Add method to decode predicted characters * Fix number characters * Rescale gradients for mlm * Fix char embed+vectors in ml * Fix pipes * Fix pretrain args * Move get_characters_loss * Fix import * Fix import * Mention characters loss option in pretrain * Remove broken 'self attention' option in pretrain * Revert "Remove broken 'self attention' option in pretrain" This reverts commit 56b820f6afaef14e2cab9a6ff9f5edc58f806554. * Document 'characters' objective of pretrain --- spacy/_ml.py | 20 ++++++++++++-- spacy/cli/pretrain.py | 38 ++++++++++++++++----------- spacy/cli/train.py | 2 +- spacy/ml/_legacy_tok2vec.py | 11 +++++++- spacy/pipeline/pipes.pyx | 52 +++++++++++++++++++++++++++---------- website/docs/api/cli.md | 2 +- 6 files changed, 92 insertions(+), 33 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 60a0bbee0..d947aab1c 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -14,7 +14,7 @@ from thinc.api import with_getitem, flatten_add_lengths from thinc.api import uniqued, wrap, noop from thinc.linear.linear import LinearModel from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.util import get_array_module, copy_array +from thinc.neural.util import get_array_module, copy_array, to_categorical from thinc.neural.optimizers import Adam from thinc import describe @@ -840,6 +840,8 @@ def masked_language_model(vocab, model, mask_prob=0.15): def mlm_backward(d_output, sgd=None): d_output *= 1 - mask + # Rescale gradient for number of instances. + d_output *= mask.size - mask.sum() return backprop(d_output, sgd=sgd) return output, mlm_backward @@ -944,7 +946,7 @@ class CharacterEmbed(Model): # for the tip. nCv = self.ops.xp.arange(self.nC) for doc in docs: - doc_ids = doc.to_utf8_array(nr_char=self.nC) + doc_ids = self.ops.asarray(doc.to_utf8_array(nr_char=self.nC)) doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM)) # Let's say I have a 2d array of indices, and a 3d table of data. 
What numpy # incantation do I chant to get @@ -986,3 +988,17 @@ def get_cossim_loss(yh, y, ignore_zeros=False): losses[zero_indices] = 0 loss = losses.sum() return loss, -d_yh + + +def get_characters_loss(ops, docs, prediction, nr_char=10): + target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs]) + target_ids = target_ids.reshape((-1,)) + target = ops.asarray(to_categorical(target_ids, nb_classes=256), dtype="f") + target = target.reshape((-1, 256*nr_char)) + diff = prediction - target + loss = (diff**2).sum() + d_target = diff / float(prediction.shape[0]) + return loss, d_target + + + diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index aaec1ea75..6d6c65161 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -18,7 +18,8 @@ from ..errors import Errors from ..tokens import Doc from ..attrs import ID, HEAD from .._ml import Tok2Vec, flatten, chain, create_default_optimizer -from .._ml import masked_language_model, get_cossim_loss +from .._ml import masked_language_model, get_cossim_loss, get_characters_loss +from .._ml import MultiSoftmax from .. import util from .train import _load_pretrained_tok2vec @@ -42,7 +43,7 @@ from .train import _load_pretrained_tok2vec bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int), embed_rows=("Number of embedding rows", "option", "er", int), loss_func=( - "Loss function to use for the objective. Either 'L2' or 'cosine'", + "Loss function to use for the objective. Either 'characters', 'L2' or 'cosine'", "option", "L", str, @@ -85,11 +86,11 @@ def pretrain( output_dir, width=96, conv_depth=4, - bilstm_depth=0, cnn_pieces=3, sa_depth=0, - use_chars=False, cnn_window=1, + bilstm_depth=0, + use_chars=False, embed_rows=2000, loss_func="cosine", use_vectors=False, @@ -124,11 +125,7 @@ def pretrain( config[key] = str(config[key]) util.fix_random_seed(seed) - has_gpu = prefer_gpu() - if has_gpu: - import torch - - torch.set_default_tensor_type("torch.cuda.FloatTensor") + has_gpu = prefer_gpu(gpu_id=1) msg.info("Using GPU" if has_gpu else "Not using GPU") output_dir = Path(output_dir) @@ -174,6 +171,7 @@ def pretrain( subword_features=not use_chars, # Set to False for Chinese etc cnn_maxout_pieces=cnn_pieces, # If set to 1, use Mish activation. ), + objective=loss_func ) # Load in pretrained weights if init_tok2vec is not None: @@ -264,7 +262,10 @@ def make_update(model, docs, optimizer, drop=0.0, objective="L2"): RETURNS loss: A float for the loss. """ predictions, backprop = model.begin_update(docs, drop=drop) - loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective) + if objective == "characters": + loss, gradients = get_characters_loss(model.ops, docs, predictions) + else: + loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective) backprop(gradients, sgd=optimizer) # Don't want to return a cupy object here # The gradients are modified in-place by the BERT MLM, @@ -326,16 +327,23 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"): return loss, d_target -def create_pretraining_model(nlp, tok2vec): +def create_pretraining_model(nlp, tok2vec, objective="cosine", nr_char=10): """Define a network for the pretraining. We simply add an output layer onto the tok2vec input model. The tok2vec input model needs to be a model that takes a batch of Doc objects (as a list), and returns a list of arrays. Each array in the output needs to have one row per token in the doc. 
""" - output_size = nlp.vocab.vectors.data.shape[1] - output_layer = chain( - LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0) - ) + if objective == "characters": + out_sizes = [256] * nr_char + output_layer = chain( + LN(Maxout(300, pieces=3)), + MultiSoftmax(out_sizes, 300) + ) + else: + output_size = nlp.vocab.vectors.data.shape[1] + output_layer = chain( + LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0) + ) # This is annoying, but the parser etc have the flatten step after # the tok2vec. To load the weights in cleanly, we need to match # the shape of the models' components exactly. So what we cann diff --git a/spacy/cli/train.py b/spacy/cli/train.py index d4de9aeb4..fc4c9f67b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -285,7 +285,7 @@ def train( if base_model and not pipes_added: # Start with an existing model, use default optimizer - optimizer = create_default_optimizer(Model.ops) + optimizer = nlp.resume_training(device=use_gpu) else: # Start with a blank model, call begin_training cfg = {"device": use_gpu} diff --git a/spacy/ml/_legacy_tok2vec.py b/spacy/ml/_legacy_tok2vec.py index b077a46b7..3e41b1c6a 100644 --- a/spacy/ml/_legacy_tok2vec.py +++ b/spacy/ml/_legacy_tok2vec.py @@ -49,6 +49,14 @@ def Tok2Vec(width, embed_size, **kwargs): >> LN(Maxout(width, width * 5, pieces=3)), column=cols.index(ORTH), ) + elif char_embed: + embed = concatenate_lists( + CharacterEmbed(nM=64, nC=8), + FeatureExtracter(cols) >> with_flatten(glove), + ) + reduce_dimensions = LN( + Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces) + ) else: embed = uniqued( (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)), @@ -81,7 +89,8 @@ def Tok2Vec(width, embed_size, **kwargs): ) else: tok2vec = FeatureExtracter(cols) >> with_flatten( - embed >> convolution ** conv_depth, pad=conv_depth + embed + >> convolution ** conv_depth, pad=conv_depth ) if bilstm_depth >= 1: diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 8f07bf8f7..b28f34a7a 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -33,6 +33,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier from .._ml import build_bow_text_classifier, build_nel_encoder from .._ml import link_vectors_to_models, zero_init, flatten from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss +from .._ml import MultiSoftmax, get_characters_loss from ..errors import Errors, TempErrors, Warnings from .. 
import util @@ -846,11 +847,15 @@ class MultitaskObjective(Tagger): class ClozeMultitask(Pipe): @classmethod def Model(cls, vocab, tok2vec, **cfg): - output_size = vocab.vectors.data.shape[1] - output_layer = chain( - LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)), - zero_init(Affine(output_size, output_size, drop_factor=0.0)) - ) + if cfg["objective"] == "characters": + out_sizes = [256] * cfg.get("nr_char", 4) + output_layer = MultiSoftmax(out_sizes) + else: + output_size = vocab.vectors.data.shape[1] + output_layer = chain( + LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)), + zero_init(Affine(output_size, output_size, drop_factor=0.0)) + ) model = chain(tok2vec, output_layer) model = masked_language_model(vocab, model) model.tok2vec = tok2vec @@ -861,6 +866,8 @@ class ClozeMultitask(Pipe): self.vocab = vocab self.model = model self.cfg = cfg + self.cfg.setdefault("objective", "characters") + self.cfg.setdefault("nr_char", 4) def set_annotations(self, docs, dep_ids, tensors=None): pass @@ -869,7 +876,8 @@ class ClozeMultitask(Pipe): tok2vec=None, sgd=None, **kwargs): link_vectors_to_models(self.vocab) if self.model is True: - self.model = self.Model(self.vocab, tok2vec) + kwargs.update(self.cfg) + self.model = self.Model(self.vocab, tok2vec, **kwargs) X = self.model.ops.allocate((5, self.model.tok2vec.nO)) self.model.output_layer.begin_training(X) if sgd is None: @@ -883,13 +891,16 @@ class ClozeMultitask(Pipe): return tokvecs, vectors def get_loss(self, docs, vectors, prediction): - # The simplest way to implement this would be to vstack the - # token.vector values, but that's a bit inefficient, especially on GPU. - # Instead we fetch the index into the vectors table for each of our tokens, - # and look them up all at once. This prevents data copying. - ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs]) - target = vectors[ids] - loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True) + if self.cfg["objective"] == "characters": + loss, gradient = get_characters_loss(self.model.ops, docs, prediction) + else: + # The simplest way to implement this would be to vstack the + # token.vector values, but that's a bit inefficient, especially on GPU. + # Instead we fetch the index into the vectors table for each of our tokens, + # and look them up all at once. This prevents data copying. 
+ ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs]) + target = vectors[ids] + loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True) return float(loss), gradient def update(self, docs, golds, drop=0., sgd=None, losses=None): @@ -906,6 +917,20 @@ class ClozeMultitask(Pipe): if losses is not None: losses[self.name] += loss + @staticmethod + def decode_utf8_predictions(char_array): + # The format alternates filling from start and end, and 255 is missing + words = [] + char_array = char_array.reshape((char_array.shape[0], -1, 256)) + nr_char = char_array.shape[1] + char_array = char_array.argmax(axis=-1) + for row in char_array: + starts = [chr(c) for c in row[::2] if c != 255] + ends = [chr(c) for c in row[1::2] if c != 255] + word = "".join(starts + list(reversed(ends))) + words.append(word) + return words + @component("textcat", assigns=["doc.cats"]) class TextCategorizer(Pipe): @@ -1069,6 +1094,7 @@ cdef class DependencyParser(Parser): assigns = ["token.dep", "token.is_sent_start", "doc.sents"] requires = [] TransitionSystem = ArcEager + nr_feature = 8 @property def postprocesses(self): diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index fe8877c69..779fa7695 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -473,7 +473,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] | `--use-chars`, `-chr` 2.2.2 | flag | Whether to use character-based embedding. | | `--sa-depth`, `-sa` 2.2.2 | option | Depth of self-attention layers. | | `--embed-rows`, `-er` | option | Number of embedding rows. | -| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. | +| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"cosine"`, `"L2"` or `"characters"`. | | `--dropout`, `-d` | option | Dropout rate. | | `--batch-size`, `-bs` | option | Number of words per training batch. | | `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. | From 9860b8399ed2a3d1d680e1c1cd31d85926422709 Mon Sep 17 00:00:00 2001 From: graue70 <23035329+graue70@users.noreply.github.com> Date: Sun, 5 Jul 2020 15:49:06 +0200 Subject: [PATCH 26/43] Fix typo in test function docstring (#5696) --- spacy/tests/regression/test_issue2501-3000.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 1f5e44499..622fc3635 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -59,7 +59,7 @@ def test_issue2626_2835(en_tokenizer, text): def test_issue2656(en_tokenizer): - """Test that tokenizer correctly splits of punctuation after numbers with + """Test that tokenizer correctly splits off punctuation after numbers with decimal points. 
""" doc = en_tokenizer("I went for 40.3, and got home by 10.0.") From 7a2ca00794da43edc9c55e690b647d5b5b962e42 Mon Sep 17 00:00:00 2001 From: Mike Izbicki Date: Mon, 6 Jul 2020 08:03:33 -0700 Subject: [PATCH 27/43] fix bug in Korean language, resulting in 100x speedup by reducing overhead of mecab (#5701) * speed up Korean nlp 100x by stopping mecab from reloading on each doc * add contributor agreement * rename variables to improve code readability --- .github/contributors/mikeizbicki.md | 106 ++++++++++++++++++++++++++++ spacy/lang/ko/__init__.py | 27 +++---- 2 files changed, 121 insertions(+), 12 deletions(-) create mode 100644 .github/contributors/mikeizbicki.md diff --git a/.github/contributors/mikeizbicki.md b/.github/contributors/mikeizbicki.md new file mode 100644 index 000000000..6e9d8c098 --- /dev/null +++ b/.github/contributors/mikeizbicki.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Mike Izbicki | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 02 Jun 2020 | +| GitHub username | mikeizbicki | +| Website (optional) | https://izbicki.me | diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index ec79a95ab..21a754168 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -42,7 +42,11 @@ def check_spaces(text, tokens): class KoreanTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.Tokenizer = try_mecab_import() + MeCab = try_mecab_import() + self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") + + def __del__(self): + self.mecab_tokenizer.__del__() def __call__(self, text): dtokens = list(self.detailed_tokens(text)) @@ -58,17 +62,16 @@ class KoreanTokenizer(DummyTokenizer): def detailed_tokens(self, text): # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3], # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], * - with self.Tokenizer("-F%f[0],%f[7]") as tokenizer: - for node in tokenizer.parse(text, as_nodes=True): - if node.is_eos(): - break - surface = node.surface - feature = node.feature - tag, _, expr = feature.partition(",") - lemma, _, remainder = expr.partition("/") - if lemma == "*": - lemma = surface - yield {"surface": surface, "lemma": lemma, "tag": tag} + for node in self.mecab_tokenizer.parse(text, as_nodes=True): + if node.is_eos(): + break + surface = node.surface + feature = node.feature + tag, _, expr = feature.partition(",") + lemma, _, remainder = expr.partition("/") + if lemma == "*": + lemma = surface + yield {"surface": surface, "lemma": lemma, "tag": tag} class KoreanDefaults(Language.Defaults): From 546f3d10d4ab2f6e2d7149d13087a41480335ddd Mon Sep 17 00:00:00 2001 From: Jonathan Besomi <43236409+jbesomi@users.noreply.github.com> Date: Tue, 7 Jul 2020 20:54:22 +0200 Subject: [PATCH 28/43] Add texthero to universe.json (#5716) * Add texthero to universe.json * Add spaCy contributor Agreement --- .github/contributors/jbesomi.md | 106 ++++++++++++++++++++++++++++++++ website/meta/universe.json | 26 ++++++++ 2 files changed, 132 insertions(+) create mode 100644 .github/contributors/jbesomi.md diff --git a/.github/contributors/jbesomi.md b/.github/contributors/jbesomi.md new file mode 100644 index 000000000..ac43a3bfd --- /dev/null +++ b/.github/contributors/jbesomi.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. 
These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jonathan B. | +| Company name (if applicable) | besomi.ai | +| Title or role (if applicable) | - | +| Date | 07.07.2020 | +| GitHub username | jbesomi | +| Website (optional) | besomi.ai | diff --git a/website/meta/universe.json b/website/meta/universe.json index 2c74a2964..1d732f088 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2347,6 +2347,32 @@ }, "category": ["pipeline", "conversational", "research"], "tags": ["spell check", "correction", "preprocessing", "translation", "correction"] + }, + { + "id": "texthero", + "title": "Texthero", + "slogan": "Text preprocessing, representation and visualization from zero to hero.", + "description": "Texthero is a python package to work with text data efficiently. It empowers NLP developers with a tool to quickly understand any text-based dataset and it provides a solid pipeline to clean and represent text data, from zero to hero.", + "github": "jbesomi/texthero", + "pip": "texthero", + "code_example": [ + "import texthero as hero", + "import pandas as pd", + "", + "df = pd.read_csv('https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv')", + "df['named_entities'] = hero.named_entities(df['text'])", + "df.head()" + ], + "code_language": "python", + "url": "https://texthero.org", + "thumb": "https://texthero.org/img/T.png", + "image": "https://texthero.org/docs/assets/texthero.png", + "author": "Jonathan Besomi", + "author_links": { + "github": "jbesomi", + "website": "https://besomi.ai" + }, + "category": ["standalone"], } ], From 9097549227c56a34fd00b47957de12010fe57d53 Mon Sep 17 00:00:00 2001 From: gandersen101 Date: Tue, 7 Jul 2020 13:55:24 -0500 Subject: [PATCH 29/43] Adding spaczz package to universe.json (#5717) * Adding spaczz package to universe.json * Adding contributor agreement. --- .github/contributors/gandersen101.md | 106 +++++++++++++++++++++++++++ website/meta/universe.json | 29 ++++++++ 2 files changed, 135 insertions(+) create mode 100644 .github/contributors/gandersen101.md diff --git a/.github/contributors/gandersen101.md b/.github/contributors/gandersen101.md new file mode 100644 index 000000000..cae4ad047 --- /dev/null +++ b/.github/contributors/gandersen101.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Grant Andersen | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 07.06.2020 | +| GitHub username | gandersen101 | +| Website (optional) | | diff --git a/website/meta/universe.json b/website/meta/universe.json index 1d732f088..e57f2bf70 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,34 @@ { "resources": [ + { + "id": "spaczz", + "title": "spaczz", + "slogan": "Fuzzy matching and more for spaCy.", + "description": "Spaczz provides fuzzy matching and multi-token regex matching functionality for spaCy. Spaczz's components have similar APIs to their spaCy counterparts and spaczz pipeline components can integrate into spaCy pipelines where they can be saved/loaded as models.", + "github": "gandersen101/spaczz", + "pip": "spaczz", + "code_example": [ + "import spacy", + "from spaczz.pipeline import SpaczzRuler", + "", + "nlp = spacy.blank('en')", + "ruler = SpaczzRuler(nlp)", + "ruler.add_patterns([{'label': 'PERSON', 'pattern': 'Bill Gates', 'type': 'fuzzy'}])", + "nlp.add_pipe(ruler)", + "", + "doc = nlp('Oops, I spelled Bill Gattes' name wrong.')", + "print([(ent.text, ent.start, ent.end, ent.label_) for ent in doc.ents])" + ], + "code_language": "python", + "url": "https://spaczz.readthedocs.io/en/latest/", + "author": "Grant Andersen", + "author_links": { + "twitter": "gandersen101", + "github": "gandersen101" + }, + "category": ["pipeline"], + "tags": ["fuzzy-matching", "regex"] + }, { "id": "spacy-universal-sentence-encoder", "title": "SpaCy - Universal Sentence Encoder", From 109849bd311490f17a29b320cb032e43d153f36f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 7 Jul 2020 21:12:28 +0200 Subject: [PATCH 30/43] Fix and update universe.json [ci skip] --- website/meta/universe.json | 53 +++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index e57f2bf70..2b6d82663 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,29 @@ { "resources": [ + { + "id": "spacy-streamlit", + "title": "spacy-streamlit", + "slogan": "spaCy building blocks for Streamlit apps", + "github": "explosion/spacy-streamlit", + "description": "This package contains utilities for visualizing spaCy models and building interactive spaCy-powered apps with [Streamlit](https://streamlit.io). 
It includes various building blocks you can use in your own Streamlit app, like visualizers for **syntactic dependencies**, **named entities**, **text classification**, **semantic similarity** via word vectors, token attributes, and more.", + "pip": "spacy-streamlit", + "category": ["visualizers"], + "thumb": "https://i.imgur.com/mhEjluE.jpg", + "image": "https://user-images.githubusercontent.com/13643239/85388081-f2da8700-b545-11ea-9bd4-e303d3c5763c.png", + "code_example": [ + "import spacy_streamlit", + "", + "models = [\"en_core_web_sm\", \"en_core_web_md\"]", + "default_text = \"Sundar Pichai is the CEO of Google.\"", + "spacy_streamlit.visualize(models, default_text))" + ], + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines", + "website": "https://ines.io" + } + }, { "id": "spaczz", "title": "spaczz", @@ -1266,6 +1290,19 @@ "youtube": "K1elwpgDdls", "category": ["videos"] }, + { + "type": "education", + "id": "video-spacy-course-es", + "title": "NLP avanzado con spaCy · Un curso en línea gratis", + "description": "spaCy es un paquete moderno de Python para hacer Procesamiento de Lenguaje Natural de potencia industrial. En este curso en línea, interactivo y gratuito, aprenderás a usar spaCy para construir sistemas avanzados de comprensión de lenguaje natural usando enfoques basados en reglas y en machine learning.", + "url": "https://course.spacy.io/es", + "author": "Camila Gutiérrez", + "author_links": { + "twitter": "Mariacamilagl30" + }, + "youtube": "RNiLVCE5d4k", + "category": ["videos"] + }, { "type": "education", "id": "video-intro-to-nlp-episode-1", @@ -1322,6 +1359,20 @@ "youtube": "IqOJU1-_Fi0", "category": ["videos"] }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-5", + "title": "Intro to NLP with spaCy (5)", + "slogan": "Episode 5: Rules vs. Machine Learning", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. 
Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "f4sqeLRzkPg", + "category": ["videos"] + }, { "type": "education", "id": "video-spacy-irl-entity-linking", @@ -2401,7 +2452,7 @@ "github": "jbesomi", "website": "https://besomi.ai" }, - "category": ["standalone"], + "category": ["standalone"] } ], From 893133873d8ef906b37b2214fe34a1b4b94b2d3e Mon Sep 17 00:00:00 2001 From: gandersen101 Date: Tue, 7 Jul 2020 19:16:28 -0500 Subject: [PATCH 31/43] Fix quote issue in spaczz universe.json --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 2b6d82663..c5eb96e43 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -40,7 +40,7 @@ "ruler.add_patterns([{'label': 'PERSON', 'pattern': 'Bill Gates', 'type': 'fuzzy'}])", "nlp.add_pipe(ruler)", "", - "doc = nlp('Oops, I spelled Bill Gattes' name wrong.')", + "doc = nlp('Oops, I spelled Bill Gatez wrong.')", "print([(ent.text, ent.start, ent.end, ent.label_) for ent in doc.ents])" ], "code_language": "python", From 923affd091dac4c1a7e11168f7c8f1f05dcc224e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 9 Jul 2020 22:11:13 +0200 Subject: [PATCH 32/43] Remove is_base_form from French lemmatizer (#5733) Remove English-specific is_base_form from French lemmatizer. --- spacy/lang/fr/lemmatizer.py | 40 ------------------------------------- 1 file changed, 40 deletions(-) diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index 79f4dd28d..af8345e1b 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -45,9 +45,6 @@ class FrenchLemmatizer(Lemmatizer): univ_pos = "sconj" else: return [self.lookup(string)] - # See Issue #435 for example of where this logic is requied. - if self.is_base_form(univ_pos, morphology): - return list(set([string.lower()])) index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) rules_table = self.lookups.get_table("lemma_rules", {}) @@ -59,43 +56,6 @@ class FrenchLemmatizer(Lemmatizer): ) return lemmas - def is_base_form(self, univ_pos, morphology=None): - """ - Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely. 
- """ - morphology = {} if morphology is None else morphology - others = [ - key - for key in morphology - if key not in (POS, "Number", "POS", "VerbForm", "Tense") - ] - if univ_pos == "noun" and morphology.get("Number") == "sing": - return True - elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": - return True - # This maps 'VBP' to base form -- probably just need 'IS_BASE' - # morphology - elif univ_pos == "verb" and ( - morphology.get("VerbForm") == "fin" - and morphology.get("Tense") == "pres" - and morphology.get("Number") is None - and not others - ): - return True - elif univ_pos == "adj" and morphology.get("Degree") == "pos": - return True - elif VerbForm_inf in morphology: - return True - elif VerbForm_none in morphology: - return True - elif Number_sing in morphology: - return True - elif Degree_pos in morphology: - return True - else: - return False - def noun(self, string, morphology=None): return self(string, "noun", morphology) From 0a62098c5f0e0abe640a76776ddf6ea7094e2c23 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 9 Jul 2020 22:11:24 +0200 Subject: [PATCH 33/43] Fix lemmatizer is_base_form for python2.7 (#5734) * Fix lemmatizer init args for python2.7 * Move English is_base_form to a class method * Skip test pickling PhraseMatcher for python2 --- spacy/lang/en/__init__.py | 71 +++++++++---------- spacy/lemmatizer.py | 2 +- spacy/tests/regression/test_issue3001-3500.py | 1 + 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index d52f3dfd8..f58ae4a4e 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -18,41 +18,6 @@ def _return_en(_): return "en" -def en_is_base_form(univ_pos, morphology=None): - """ - Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely. - - univ_pos (unicode / int): The token's universal part-of-speech tag. - morphology (dict): The token's morphological features following the - Universal Dependencies scheme. - """ - if morphology is None: - morphology = {} - if univ_pos == "noun" and morphology.get("Number") == "sing": - return True - elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": - return True - # This maps 'VBP' to base form -- probably just need 'IS_BASE' - # morphology - elif univ_pos == "verb" and ( - morphology.get("VerbForm") == "fin" - and morphology.get("Tense") == "pres" - and morphology.get("Number") is None - ): - return True - elif univ_pos == "adj" and morphology.get("Degree") == "pos": - return True - elif morphology.get("VerbForm") == "inf": - return True - elif morphology.get("VerbForm") == "none": - return True - elif morphology.get("Degree") == "pos": - return True - else: - return False - - class EnglishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) @@ -61,7 +26,6 @@ class EnglishDefaults(Language.Defaults): tag_map = TAG_MAP stop_words = STOP_WORDS morph_rules = MORPH_RULES - is_base_form = en_is_base_form syntax_iterators = SYNTAX_ITERATORS single_orth_variants = [ {"tags": ["NFP"], "variants": ["…", "..."]}, @@ -72,6 +36,41 @@ class EnglishDefaults(Language.Defaults): {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}, ] + @classmethod + def is_base_form(cls, univ_pos, morphology=None): + """ + Check whether we're dealing with an uninflected paradigm, so we can + avoid lemmatization entirely. + + univ_pos (unicode / int): The token's universal part-of-speech tag. 
+ morphology (dict): The token's morphological features following the + Universal Dependencies scheme. + """ + if morphology is None: + morphology = {} + if univ_pos == "noun" and morphology.get("Number") == "sing": + return True + elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": + return True + # This maps 'VBP' to base form -- probably just need 'IS_BASE' + # morphology + elif univ_pos == "verb" and ( + morphology.get("VerbForm") == "fin" + and morphology.get("Tense") == "pres" + and morphology.get("Number") is None + ): + return True + elif univ_pos == "adj" and morphology.get("Degree") == "pos": + return True + elif morphology.get("VerbForm") == "inf": + return True + elif morphology.get("VerbForm") == "none": + return True + elif morphology.get("Degree") == "pos": + return True + else: + return False + class English(Language): lang = "en" diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index f72eae128..8b2375257 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -21,7 +21,7 @@ class Lemmatizer(object): def load(cls, *args, **kwargs): raise NotImplementedError(Errors.E172) - def __init__(self, lookups, *args, is_base_form=None, **kwargs): + def __init__(self, lookups, is_base_form=None, *args, **kwargs): """Initialize a Lemmatizer. lookups (Lookups): The lookups object containing the (optional) tables diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index effbebb92..a10225390 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -121,6 +121,7 @@ def test_issue3248_1(): assert len(matcher) == 2 +@pytest.mark.skipif(is_python2, reason="Can't pickle instancemethod for is_base_form") def test_issue3248_2(): """Test that the PhraseMatcher can be pickled correctly.""" nlp = English() From 27a1cd3c630055802513a20c1e75d0b37943cc39 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Sun, 12 Jul 2020 13:06:46 -0700 Subject: [PATCH 34/43] fix meta serialization in train (#5751) Co-authored-by: Mark Neumann --- spacy/cli/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index fc4c9f67b..b81214b95 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -576,6 +576,8 @@ def train( with nlp.use_params(optimizer.averages): final_model_path = output_path / "model-final" nlp.to_disk(final_model_path) + srsly.write_json(final_model_path / "meta.json", meta) + meta_loc = output_path / "model-final" / "meta.json" final_meta = srsly.read_json(meta_loc) final_meta.setdefault("accuracy", {}) From 7ea2cc76508cdcd6d854381f00f1da79309a0df3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 13 Jul 2020 14:55:56 +0200 Subject: [PATCH 35/43] Set version to 2.3.2 (#5756) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index cd97fa987..42c38cda5 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.3.1" +__version__ = "2.3.2" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 5228920e2fa3c1c067ec753ae40cfaea07908cfb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 14 Jul 2020 14:09:48 +0200 Subject: [PATCH 36/43] Clarify warning W030 for misaligned BILUO tags (#5761) --- spacy/errors.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index a25661a20..ff71b60eb 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -113,8 +113,8 @@ class Warnings(object): W030 = ("Some entities could not be aligned in the text \"{text}\" with " "entities \"{entities}\". Use " "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" - " to check the alignment. Misaligned entities ('-') will be " - "ignored during training.") + " to check the alignment. Misaligned entities (with BILUO tag '-') " + "will be ignored during training.") W031 = ("Model '{model}' ({model_version}) requires spaCy {version} and " "is incompatible with the current spaCy version ({current}). This " "may lead to unexpected results or runtime errors. To resolve " From 6f4e4aceb3710262ab376c79c5f740f72070b8e0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 18 Jul 2020 23:50:29 +0200 Subject: [PATCH 37/43] Add Plausible [ci skip] --- website/gatsby-config.js | 6 +++ website/meta/site.json | 1 + website/package-lock.json | 80 +++++++++++++++++++++++++++++++++------ website/package.json | 1 + 4 files changed, 76 insertions(+), 12 deletions(-) diff --git a/website/gatsby-config.js b/website/gatsby-config.js index aacc25545..2a5f957f4 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -140,6 +140,12 @@ module.exports = { respectDNT: true, }, }, + { + resolve: `gatsby-plugin-plausible`, + options: { + domain: site.domain, + }, + }, `gatsby-plugin-offline`, ], } diff --git a/website/meta/site.json b/website/meta/site.json index 8b8424f82..4d12a4c46 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -3,6 +3,7 @@ "description": "spaCy is a free open-source library for Natural Language Processing in Python. 
It features NER, POS tagging, dependency parsing, word vectors and more.", "slogan": "Industrial-strength Natural Language Processing in Python", "siteUrl": "https://spacy.io", + "domain": "spacy.io", "email": "contact@explosion.ai", "company": "Explosion AI", "companyUrl": "https://explosion.ai", diff --git a/website/package-lock.json b/website/package-lock.json index cb1731c1b..dded33fb0 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -1424,6 +1424,7 @@ "version": "0.8.0", "resolved": "https://registry.npmjs.org/@sindresorhus/slugify/-/slugify-0.8.0.tgz", "integrity": "sha512-Y+C3aG0JHmi4nCfixHgq0iAtqWCjMCliWghf6fXbemRKSGzpcrHdYxGZGDt8MeFg+gH7ounfMbz6WogqKCWvDg==", + "dev": true, "requires": { "escape-string-regexp": "^1.0.5", "lodash.deburr": "^4.1.0" @@ -3570,7 +3571,8 @@ }, "ansi-regex": { "version": "2.1.1", - "bundled": true + "bundled": true, + "optional": true }, "aproba": { "version": "1.2.0", @@ -3588,11 +3590,13 @@ }, "balanced-match": { "version": "1.0.0", - "bundled": true + "bundled": true, + "optional": true }, "brace-expansion": { "version": "1.1.11", "bundled": true, + "optional": true, "requires": { "balanced-match": "^1.0.0", "concat-map": "0.0.1" @@ -3605,15 +3609,18 @@ }, "code-point-at": { "version": "1.1.0", - "bundled": true + "bundled": true, + "optional": true }, "concat-map": { "version": "0.0.1", - "bundled": true + "bundled": true, + "optional": true }, "console-control-strings": { "version": "1.1.0", - "bundled": true + "bundled": true, + "optional": true }, "core-util-is": { "version": "1.0.2", @@ -3716,7 +3723,8 @@ }, "inherits": { "version": "2.0.3", - "bundled": true + "bundled": true, + "optional": true }, "ini": { "version": "1.3.5", @@ -3726,6 +3734,7 @@ "is-fullwidth-code-point": { "version": "1.0.0", "bundled": true, + "optional": true, "requires": { "number-is-nan": "^1.0.0" } @@ -3738,17 +3747,20 @@ "minimatch": { "version": "3.0.4", "bundled": true, + "optional": true, "requires": { "brace-expansion": "^1.1.7" } }, "minimist": { "version": "0.0.8", - "bundled": true + "bundled": true, + "optional": true }, "minipass": { "version": "2.3.5", "bundled": true, + "optional": true, "requires": { "safe-buffer": "^5.1.2", "yallist": "^3.0.0" @@ -3765,6 +3777,7 @@ "mkdirp": { "version": "0.5.1", "bundled": true, + "optional": true, "requires": { "minimist": "0.0.8" } @@ -3837,7 +3850,8 @@ }, "number-is-nan": { "version": "1.0.1", - "bundled": true + "bundled": true, + "optional": true }, "object-assign": { "version": "4.1.1", @@ -3847,6 +3861,7 @@ "once": { "version": "1.4.0", "bundled": true, + "optional": true, "requires": { "wrappy": "1" } @@ -3922,7 +3937,8 @@ }, "safe-buffer": { "version": "5.1.2", - "bundled": true + "bundled": true, + "optional": true }, "safer-buffer": { "version": "2.1.2", @@ -3952,6 +3968,7 @@ "string-width": { "version": "1.0.2", "bundled": true, + "optional": true, "requires": { "code-point-at": "^1.0.0", "is-fullwidth-code-point": "^1.0.0", @@ -3969,6 +3986,7 @@ "strip-ansi": { "version": "3.0.1", "bundled": true, + "optional": true, "requires": { "ansi-regex": "^2.0.0" } @@ -4007,11 +4025,13 @@ }, "wrappy": { "version": "1.0.2", - "bundled": true + "bundled": true, + "optional": true }, "yallist": { "version": "3.0.3", - "bundled": true + "bundled": true, + "optional": true } } }, @@ -7482,6 +7502,41 @@ "slash": "^1.0.0" } }, + "gatsby-plugin-plausible": { + "version": "0.0.6", + "resolved": "https://registry.npmjs.org/gatsby-plugin-plausible/-/gatsby-plugin-plausible-0.0.6.tgz", + "integrity": 
"sha512-qUdPQ3haeX2DIywGZ2boMpmFAnSbWzqS9cG9/OO0mWLigA0sDLWwGkpHIAvrfepgbB9U/roLtXflctBwOIxtcQ==", + "requires": { + "@babel/runtime": "^7.9.2", + "minimatch": "3.0.4", + "react": "^16.13.1" + }, + "dependencies": { + "@babel/runtime": { + "version": "7.10.5", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.10.5.tgz", + "integrity": "sha512-otddXKhdNn7d0ptoFRHtMLa8LqDxLYwTjB4nYgM1yy5N6gU/MUf8zqyyLltCH3yAVitBzmwK4us+DD0l/MauAg==", + "requires": { + "regenerator-runtime": "^0.13.4" + } + }, + "react": { + "version": "16.13.1", + "resolved": "https://registry.npmjs.org/react/-/react-16.13.1.tgz", + "integrity": "sha512-YMZQQq32xHLX0bz5Mnibv1/LHb3Sqzngu7xstSM+vrkE5Kzr9xE0yMByK5kMoTK30YVJE61WfbxIFFvfeDKT1w==", + "requires": { + "loose-envify": "^1.1.0", + "object-assign": "^4.1.1", + "prop-types": "^15.6.2" + } + }, + "regenerator-runtime": { + "version": "0.13.5", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.5.tgz", + "integrity": "sha512-ZS5w8CpKFinUzOwW3c83oPeVXoNsrLsaCoLtJvAClH135j/R77RuymhiSErhm2lKcwSCIpmvIWSbDkIfAqKQlA==" + } + } + }, "gatsby-plugin-react-helmet": { "version": "3.0.6", "resolved": "https://registry.npmjs.org/gatsby-plugin-react-helmet/-/gatsby-plugin-react-helmet-3.0.6.tgz", @@ -10198,7 +10253,8 @@ "lodash.deburr": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/lodash.deburr/-/lodash.deburr-4.1.0.tgz", - "integrity": "sha1-3bG7s+8HRYwBd7oH3hRCLLAz/5s=" + "integrity": "sha1-3bG7s+8HRYwBd7oH3hRCLLAz/5s=", + "dev": true }, "lodash.defaults": { "version": "4.2.0", diff --git a/website/package.json b/website/package.json index f43b9a6a0..a59bc9bdc 100644 --- a/website/package.json +++ b/website/package.json @@ -23,6 +23,7 @@ "gatsby-plugin-google-analytics": "^2.0.14", "gatsby-plugin-manifest": "^2.0.17", "gatsby-plugin-offline": "^2.0.24", + "gatsby-plugin-plausible": "0.0.6", "gatsby-plugin-react-helmet": "^3.0.6", "gatsby-plugin-react-svg": "^2.0.0", "gatsby-plugin-sass": "^2.0.10", From cd5af72c9af469bd55bcb4bc27a94db61c448919 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 19 Jul 2020 11:09:49 +0200 Subject: [PATCH 38/43] Update pkuseg version (#5774) * Update pkuseg version in Chinese tokenizer warnings * Update pkuseg version in `Makefile` * Remove warning about python3.8 wheels in docs --- Makefile | 4 ++-- spacy/lang/zh/__init__.py | 2 +- website/docs/usage/models.md | 12 ------------ 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 865bf44c5..6c0a59ba8 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ VENV := ./env$(PYVER) version := $(shell "bin/get-version.sh") dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core chmod a+rx $@ cp $@ dist/spacy.pex @@ -15,7 +15,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* $(VENV)/bin/pip wheel . 
-w ./wheelhouse - $(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core -w ./wheelhouse + $(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core -w ./wheelhouse touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 9d1cb71a7..9f8a82c10 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -16,7 +16,7 @@ from .tag_map import TAG_MAP from ... import util -_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python" +_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python" def try_jieba_import(use_jieba): diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index b11e6347a..cc65dad68 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -117,18 +117,6 @@ The Chinese language class supports three word segmentation options: better segmentation for Chinese OntoNotes and the new [Chinese models](/models/zh). - - -Note that [`pkuseg`](https://github.com/lancopku/pkuseg-python) doesn't yet ship -with pre-compiled wheels for Python 3.8. If you're running Python 3.8, you can -install it from our fork and compile it locally: - -```bash -$ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip -``` - - - The `meta` argument of the `Chinese` language class supports the following From 7e142720962e11fd62396721d7826ff2406c336c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 19 Jul 2020 11:10:11 +0200 Subject: [PATCH 39/43] Lower upper pin for cupy to 8.0.0 (#5773) --- setup.cfg | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/setup.cfg b/setup.cfg index e556ba19c..9bd45d45d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -61,21 +61,21 @@ install_requires = lookups = spacy_lookups_data>=0.3.2,<0.4.0 cuda = - cupy>=5.0.0b4,<9.0.0 + cupy>=5.0.0b4,<8.0.0 cuda80 = - cupy-cuda80>=5.0.0b4,<9.0.0 + cupy-cuda80>=5.0.0b4,<8.0.0 cuda90 = - cupy-cuda90>=5.0.0b4,<9.0.0 + cupy-cuda90>=5.0.0b4,<8.0.0 cuda91 = - cupy-cuda91>=5.0.0b4,<9.0.0 + cupy-cuda91>=5.0.0b4,<8.0.0 cuda92 = - cupy-cuda92>=5.0.0b4,<9.0.0 + cupy-cuda92>=5.0.0b4,<8.0.0 cuda100 = - cupy-cuda100>=5.0.0b4,<9.0.0 + cupy-cuda100>=5.0.0b4,<8.0.0 cuda101 = - cupy-cuda101>=5.0.0b4,<9.0.0 + cupy-cuda101>=5.0.0b4,<8.0.0 cuda102 = - cupy-cuda102>=5.0.0b4,<9.0.0 + cupy-cuda102>=5.0.0b4,<8.0.0 # Language tokenizers with external dependencies ja = sudachipy>=0.4.5 From 597bcc629e173dfd87422188dc76a2f1053a9bba Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 19 Jul 2020 11:13:39 +0200 Subject: [PATCH 40/43] Improve tag map initialization and updating (#5768) * Improve tag map initialization and updating Generalize tag map initialization and updating so that a provided tag map can be loaded correctly in the CLI. * normalize provided tag map as necessary * use the same method for initializing and overwriting the tag map * Reinitialize cache after loading new tag map Reinitialize the cache with the right size after loading a new tag map. 
--- spacy/cli/debug_data.py | 4 ++-- spacy/cli/train.py | 4 ++-- spacy/morphology.pyx | 31 ++++++++++++++++--------------- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 7a4a093e2..22540c779 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -70,8 +70,8 @@ def debug_data( else: lang_cls = get_lang_class(lang) nlp = lang_cls() - # Update tag map with provided mapping - nlp.vocab.morphology.tag_map.update(tag_map) + # Replace tag map with provided mapping + nlp.vocab.morphology.load_tag_map(tag_map) msg.divider("Data format validation") diff --git a/spacy/cli/train.py b/spacy/cli/train.py index b81214b95..e24aa8a95 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -250,8 +250,8 @@ def train( pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) - # Update tag map with provided mapping - nlp.vocab.morphology.tag_map.update(tag_map) + # Replace tag map with provided mapping + nlp.vocab.morphology.load_tag_map(tag_map) # Create empty extra lexeme tables so the data from spacy-lookups-data # isn't loaded if these features are accessed diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index a9bab38ed..18bba0124 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -150,6 +150,19 @@ cdef class Morphology: self.mem = Pool() self.strings = string_store self.tags = PreshMap() + self._feat_map = MorphologyClassMap(FEATURES) + self.load_tag_map(tag_map) + self.lemmatizer = lemmatizer + + self._cache = PreshMapArray(self.n_tags) + self.exc = {} + if exc is not None: + for (tag, orth), attrs in exc.items(): + attrs = _normalize_props(attrs) + self.add_special_case( + self.strings.as_string(tag), self.strings.as_string(orth), attrs) + + def load_tag_map(self, tag_map): # Add special space symbol. We prefix with underscore, to make sure it # always sorts to the end. 
if '_SP' in tag_map: @@ -160,29 +173,17 @@ cdef class Morphology: self.strings.add('_SP') tag_map = dict(tag_map) tag_map['_SP'] = space_attrs - self.tag_names = tuple(sorted(tag_map.keys())) self.tag_map = {} - self.lemmatizer = lemmatizer - self.n_tags = len(tag_map) self.reverse_index = {} - self._feat_map = MorphologyClassMap(FEATURES) - self._load_from_tag_map(tag_map) - - self._cache = PreshMapArray(self.n_tags) - self.exc = {} - if exc is not None: - for (tag, orth), attrs in exc.items(): - attrs = _normalize_props(attrs) - self.add_special_case( - self.strings.as_string(tag), self.strings.as_string(orth), attrs) - - def _load_from_tag_map(self, tag_map): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): attrs = _normalize_props(attrs) self.add({self._feat_map.id2feat[feat] for feat in attrs if feat in self._feat_map.id2feat}) self.tag_map[tag_str] = dict(attrs) self.reverse_index[self.strings.add(tag_str)] = i + self.tag_names = tuple(sorted(self.tag_map.keys())) + self.n_tags = len(self.tag_map) + self._cache = PreshMapArray(self.n_tags) def __reduce__(self): return (Morphology, (self.strings, self.tag_map, self.lemmatizer, From a8978ca285fa7ebf0867f54723a6ba5569b1c156 Mon Sep 17 00:00:00 2001 From: Alec Chapman Date: Sun, 19 Jul 2020 06:35:31 -0500 Subject: [PATCH 41/43] Add VA COVID-19 NLP project to spaCy Universe (#5777) * Update universe.json Add cov-bsv to "resources" * Update universe.json * add contributor agreement --- .github/contributors/abchapman93.md | 106 ++++++++++++++++++++++++++++ website/meta/universe.json | 24 +++++++ 2 files changed, 130 insertions(+) create mode 100644 .github/contributors/abchapman93.md diff --git a/.github/contributors/abchapman93.md b/.github/contributors/abchapman93.md new file mode 100644 index 000000000..5af0cb873 --- /dev/null +++ b/.github/contributors/abchapman93.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Alec Chapman | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 7/17/2020 | +| GitHub username | abchapman93 | +| Website (optional) | | diff --git a/website/meta/universe.json b/website/meta/universe.json index c5eb96e43..e832b511f 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2453,6 +2453,30 @@ "website": "https://besomi.ai" }, "category": ["standalone"] + }, + { + "id": "cov-bsv", + "title": "VA COVID-19 NLP BSV", + "slogan": "spaCy pipeline for COVID-19 surveillance.", + "github": "abchapman93/VA_COVID-19_NLP_BSV", + "description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.", + "pip": "cov-bsv", + "code_example": [ + "import cov_bsv", + "", + "nlp = cov_bsv.load()", + "text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'", + "", + "print(doc.ents)", + "print(doc._.cov_classification)", + "cov_bsv.visualize_doc(doc)" + ], + "category": ["pipeline", "standalone", "biomedical", "scientific"], + "tags": ["clinical", "epidemiology", "covid-19", "surveillance"], + "author": "Alec Chapman", + "author_links": { + "github": "abchapman93" + } } ], From e6967ca98a9f39d196031aff59e7a2dae1033641 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 20 Jul 2020 14:59:41 +0200 Subject: [PATCH 42/43] Revert cupy-cuda version update --- setup.cfg | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/setup.cfg b/setup.cfg index 53a359247..a1c881d10 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,21 +65,21 @@ console_scripts = lookups = spacy_lookups_data>=0.3.2,<0.4.0 cuda = - cupy>=5.0.0b4,<8.0.0 + cupy>=5.0.0b4,<9.0.0 cuda80 = - cupy-cuda80>=5.0.0b4,<8.0.0 + cupy-cuda80>=5.0.0b4,<9.0.0 cuda90 = - cupy-cuda90>=5.0.0b4,<8.0.0 + cupy-cuda90>=5.0.0b4,<9.0.0 cuda91 = - cupy-cuda91>=5.0.0b4,<8.0.0 + cupy-cuda91>=5.0.0b4,<9.0.0 cuda92 = - cupy-cuda92>=5.0.0b4,<8.0.0 + cupy-cuda92>=5.0.0b4,<9.0.0 cuda100 = - cupy-cuda100>=5.0.0b4,<8.0.0 + cupy-cuda100>=5.0.0b4,<9.0.0 cuda101 = - cupy-cuda101>=5.0.0b4,<8.0.0 + cupy-cuda101>=5.0.0b4,<9.0.0 cuda102 = - cupy-cuda102>=5.0.0b4,<8.0.0 + cupy-cuda102>=5.0.0b4,<9.0.0 # Language tokenizers with external dependencies ja = sudachipy>=0.4.5 From d51db72e461261ddc74e70abd7e2f745610a4408 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 20 Jul 2020 15:01:36 +0200 Subject: [PATCH 43/43] Remove Python 2 marker --- spacy/tests/regression/test_issue3001-3500.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 4219da40f..ca4733f0d 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -121,7 +121,6 @@ def test_issue3248_1(): assert len(matcher) == 2 -@pytest.mark.skipif(is_python2, reason="Can't pickle instancemethod for is_base_form") def test_issue3248_2(): """Test that the PhraseMatcher can be pickled correctly.""" nlp = English()