From a741de7cf658ce9a90d7afe67c88face8fb658ad Mon Sep 17 00:00:00 2001 From: Olamilekan Wahab Date: Sat, 21 Dec 2019 05:11:50 -0800 Subject: [PATCH 01/11] Adding support for Yoruba Language (#4614) * Adding Support for Yoruba * test text * Updated test string. * Fixing encoding declaration. * Adding encoding to stop_words.py * Added contributor agreement and removed iranlowo. * Added removed test files and removed iranlowo to keep project bare. * Returned CONTRIBUTING.md to default state. * Added delted conftest entries * Tidy up and auto-format * Revert CONTRIBUTING.md Co-authored-by: Ines Montani --- .github/contributors/Olamyy.md | 106 +++++++++++++++++++++++++ spacy/lang/yo/__init__.py | 24 ++++++ spacy/lang/yo/examples.py | 26 +++++++ spacy/lang/yo/lex_attrs.py | 115 ++++++++++++++++++++++++++++ spacy/lang/yo/stop_words.py | 12 +++ spacy/tests/conftest.py | 6 ++ spacy/tests/lang/test_attrs.py | 2 +- spacy/tests/lang/test_initialize.py | 2 +- spacy/tests/lang/yo/__init__.py | 0 spacy/tests/lang/yo/test_text.py | 32 ++++++++ 10 files changed, 323 insertions(+), 2 deletions(-) create mode 100644 .github/contributors/Olamyy.md create mode 100644 spacy/lang/yo/__init__.py create mode 100644 spacy/lang/yo/examples.py create mode 100644 spacy/lang/yo/lex_attrs.py create mode 100644 spacy/lang/yo/stop_words.py create mode 100644 spacy/tests/lang/yo/__init__.py create mode 100644 spacy/tests/lang/yo/test_text.py diff --git a/.github/contributors/Olamyy.md b/.github/contributors/Olamyy.md new file mode 100644 index 000000000..711144825 --- /dev/null +++ b/.github/contributors/Olamyy.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). 
+The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ x ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Olamilekan Wahab | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 8/11/2019 | +| GitHub username | Olamyy | +| Website (optional) | | diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py new file mode 100644 index 000000000..f227203cc --- /dev/null +++ b/spacy/lang/yo/__init__.py @@ -0,0 +1,24 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...language import Language +from ...attrs import LANG + + +class YorubaDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) + lex_attr_getters[LANG] = lambda text: "yo" + stop_words = STOP_WORDS + tokenizer_exceptions = BASE_EXCEPTIONS + + +class Yoruba(Language): + lang = "yo" + Defaults = YorubaDefaults + + +__all__ = ["Yoruba"] diff --git a/spacy/lang/yo/examples.py b/spacy/lang/yo/examples.py new file mode 100644 index 000000000..170ddc803 --- /dev/null +++ b/spacy/lang/yo/examples.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.yo.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +# 1. https://yo.wikipedia.org/wiki/Wikipedia:%C3%80y%E1%BB%8Dk%C3%A0_p%C3%A0t%C3%A0k%C3%AC +# 2.https://yo.wikipedia.org/wiki/Oj%C3%BAew%C3%A9_%C3%80k%E1%BB%8D%CC%81k%E1%BB%8D%CC%81 +# 3. 
https://www.bbc.com/yoruba + +sentences = [ + "Ìjọba Tanzania fi Ajìjàgbara Ọmọ Orílẹ̀-èdèe Uganda sí àtìmọ́lé", + "Olúṣẹ́gun Ọbásanjọ́, tí ó jẹ́ Ààrẹ ìjọba ológun àná (láti ọdún 1976 sí 1979), tí ó sì tún ṣe Ààrẹ ìjọba alágbádá tí ìbò gbé wọlé (ní ọdún 1999 sí 2007), kúndùn láti máa bu ẹnu àtẹ́ lu àwọn " + "ètò ìjọba Ààrẹ orílẹ̀-èdè Nàìjíríà tí ó jẹ tẹ̀lé e.", + "Akin Alabi rán ẹnu mọ́ agbárá Adárí Òsìsẹ̀, àwọn ọmọ Nàìjíríà dẹnu bò ó", + "Ta ló leè dúró s'ẹ́gbẹ̀ẹ́ Okunnu láì rẹ́rìín?", + "Dídarapọ̀ mọ́n ìpolongo", + "Bi a se n so, omobinrin ni oruko ni ojo kejo bee naa ni omokunrin ni oruko ni ojo kesan.", + "Oríṣìíríṣìí nǹkan ló le yọrí sí orúkọ tí a sọ ọmọ", + "Gbogbo won ni won ni oriki ti won", +] diff --git a/spacy/lang/yo/lex_attrs.py b/spacy/lang/yo/lex_attrs.py new file mode 100644 index 000000000..a9f1b85f6 --- /dev/null +++ b/spacy/lang/yo/lex_attrs.py @@ -0,0 +1,115 @@ +# coding: utf8 +from __future__ import unicode_literals + +import unicodedata + +from ...attrs import LIKE_NUM + + +_num_words = [ + "ení", + "oókàn", + "ọ̀kanlá", + "ẹ́ẹdọ́gbọ̀n", + "àádọ́fà", + "ẹ̀walélúgba", + "egbèje", + "ẹgbàárin", + "èjì", + "eéjì", + "èjìlá", + "ọgbọ̀n", + "ọgọ́fà", + "ọ̀ọ́dúrún", + "ẹgbẹ̀jọ", + "ẹ̀ẹ́dẹ́gbàárùn", + "ẹ̀ta", + "ẹẹ́ta", + "ẹ̀talá", + "aárùndílogójì", + "àádóje", + "irinwó", + "ẹgbẹ̀sàn", + "ẹgbàárùn", + "ẹ̀rin", + "ẹẹ́rin", + "ẹ̀rinlá", + "ogójì", + "ogóje", + "ẹ̀ẹ́dẹ́gbẹ̀ta", + "ẹgbàá", + "ẹgbàájọ", + "àrún", + "aárùn", + "ẹ́ẹdógún", + "àádọ́ta", + "àádọ́jọ", + "ẹgbẹ̀ta", + "ẹgboókànlá", + "ẹgbàawǎ", + "ẹ̀fà", + "ẹẹ́fà", + "ẹẹ́rìndílógún", + "ọgọ́ta", + "ọgọ́jọ", + "ọ̀ọ́dẹ́gbẹ̀rin", + "ẹgbẹ́ẹdógún", + "ọkẹ́marun", + "èje", + "etàdílógún", + "àádọ́rin", + "àádọ́sán", + "ẹgbẹ̀rin", + "ẹgbàajì", + "ẹgbẹ̀ẹgbẹ̀rún", + "ẹ̀jọ", + "ẹẹ́jọ", + "eéjìdílógún", + "ọgọ́rin", + "ọgọsàn", + "ẹ̀ẹ́dẹ́gbẹ̀rún", + "ẹgbẹ́ẹdọ́gbọ̀n", + "ọgọ́rùn ọkẹ́", + "ẹ̀sán", + "ẹẹ́sàn", + "oókàndílógún", + "àádọ́rùn", + "ẹ̀wadilúgba", + "ẹgbẹ̀rún", + "ẹgbàáta", + "ẹ̀wá", + "ẹẹ́wàá",
+ "ogún", + "ọgọ́rùn", + "igba", + "ẹgbẹ̀fà", + "ẹ̀ẹ́dẹ́gbarin", +] + + +def strip_accents_text(text): + """ + Converts the string to NFD, separates & returns only the base characters + :param text: + :return: input string without diacritic adornments on base characters + """ + return "".join( + c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn" + ) + + +def like_num(text): + text = text.replace(",", "").replace(".", "") + num_markers = ["dí", "dọ", "lé", "dín", "di", "din", "le", "do"] + if any(mark in text for mark in num_markers): + return True + text = strip_accents_text(text) + _num_words_stripped = [strip_accents_text(num) for num in _num_words] + if text.isdigit(): + return True + if text in _num_words_stripped or text.lower() in _num_words_stripped: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/yo/stop_words.py b/spacy/lang/yo/stop_words.py new file mode 100644 index 000000000..53d382ad3 --- /dev/null +++ b/spacy/lang/yo/stop_words.py @@ -0,0 +1,12 @@ +# coding: utf8 +from __future__ import unicode_literals + +# stop words as whitespace-separated list.
+# Source: https://raw.githubusercontent.com/dohliam/more-stoplists/master/yo/yo.txt + +STOP_WORDS = set( + "a an b bá bí bẹ̀rẹ̀ d e f fún fẹ́ g gbogbo i inú j jù jẹ jẹ́ k kan kì kí kò " + "l láti lè lọ m mi mo máa mọ̀ n ni náà ní nígbà nítorí nǹkan o p padà pé " + "púpọ̀ pẹ̀lú r rẹ̀ s sì sí sínú t ti tí u w wà wá wọn wọ́n y yìí à àti àwọn á " + "è é ì í ò òun ó ù ú ń ńlá ǹ ̀ ́ ̣ ṣ ṣe ṣé ṣùgbọ́n ẹ ẹmọ́ ọ ọjọ́ ọ̀pọ̀lọpọ̀".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 959a6b670..1a33221c2 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -219,8 +219,14 @@ def uk_tokenizer(): def ur_tokenizer(): return get_lang_class("ur").Defaults.create_tokenizer() + +@pytest.fixture(scope="session") +def yo_tokenizer(): + return get_lang_class("yo").Defaults.create_tokenizer() + @pytest.fixture(scope="session") def zh_tokenizer(): pytest.importorskip("jieba") return get_lang_class("zh").Defaults.create_tokenizer() + diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 4bb5aac70..ff630f0fa 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -87,4 +87,4 @@ def test_lex_attrs_like_url(text, match): ], ) def test_lex_attrs_word_shape(text, shape): - assert word_shape(text) == shape + assert word_shape(text) == shape \ No newline at end of file diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 9b01340e3..5c701fc22 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -11,7 +11,7 @@ from spacy.util import get_lang_class LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", - "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"] + "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur", 'yo'] # fmt: on diff --git 
a/spacy/tests/lang/yo/__init__.py b/spacy/tests/lang/yo/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/yo/test_text.py b/spacy/tests/lang/yo/test_text.py new file mode 100644 index 000000000..ce6408b67 --- /dev/null +++ b/spacy/tests/lang/yo/test_text.py @@ -0,0 +1,32 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.yo.lex_attrs import like_num + + +def test_yo_tokenizer_handles_long_text(yo_tokenizer): + text = """Àwọn ọmọ ìlú tí wọ́n ń ṣàmúlò ayélujára ti bẹ̀rẹ̀ ìkọkúkọ sórí àwòrán ààrẹ Nkurunziza nínú ìfẹ̀hónúhàn pẹ̀lú àmì ìdámọ̀: Nkurunziza àti Burundi: + Ọmọ ilé ẹ̀kọ́ gíga ní ẹ̀wọ̀n fún kíkọ ìkọkúkọ sí orí àwòrán Ààrẹ . + Bí mo bá ṣe èyí ní Burundi , ó ṣe é ṣe kí a fi mí sí àtìmọ́lé + Ìjọba Burundi fi akẹ́kọ̀ọ́bìnrin sí àtìmọ́lé látàrí ẹ̀sùn ìkọkúkọ sí orí àwòrán ààrẹ. A túwíìtì àwòrán ìkọkúkọ wa ní ìbánikẹ́dùn ìṣẹ̀lẹ̀ náà. + Wọ́n ní kí a dán an wò, kí a kọ nǹkan sí orí àwòrán ààrẹ mo sì ṣe bẹ́ẹ̀. Mo ní ìgbóyà wípé ẹnikẹ́ni kò ní mú mi níbí. 
+ Ìfòfinlíle mú àtakò""" + tokens = yo_tokenizer(text) + assert len(tokens) == 121 + + +@pytest.mark.parametrize( + "text,match", + [("ení", True), ("ogun", True), ("mewadinlogun", True), ("ten", False)], +) +def test_lex_attrs_like_number(yo_tokenizer, text, match): + tokens = yo_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + + +@pytest.mark.parametrize("word", ["eji", "ejila", "ogun", "aárùn"]) +def test_yo_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) From cb4145adc7fb9ed7652d0dcc83e0e6770e679235 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 21 Dec 2019 19:04:17 +0100 Subject: [PATCH 02/11] Tidy up and auto-format --- spacy/lang/el/tag_map.py | 32 +-- spacy/lang/ja/__init__.py | 10 +- spacy/lang/lb/punctuation.py | 3 +- spacy/lang/lb/tokenizer_exceptions.py | 2 +- spacy/lang/nb/tag_map.py | 332 ++++++++++++++++++----- spacy/pipeline/entityruler.py | 5 +- spacy/tests/conftest.py | 5 +- spacy/tests/lang/fi/test_tokenizer.py | 2 +- spacy/tests/lang/lb/test_exceptions.py | 4 + spacy/tests/lang/lb/test_text.py | 2 +- spacy/tests/lang/test_attrs.py | 2 +- spacy/tests/parser/test_parse.py | 10 +- spacy/tests/pipeline/test_tagger.py | 1 - spacy/tests/regression/test_issue4674.py | 5 +- 14 files changed, 308 insertions(+), 107 deletions(-) diff --git a/spacy/lang/el/tag_map.py b/spacy/lang/el/tag_map.py index 30816dbe4..b346299bc 100644 --- a/spacy/lang/el/tag_map.py +++ b/spacy/lang/el/tag_map.py @@ -4249,20 +4249,20 @@ TAG_MAP = { "Voice": "Act", "Case": "Nom|Gen|Dat|Acc|Voc", }, - 'ADJ': {POS: ADJ}, - 'ADP': {POS: ADP}, - 'ADV': {POS: ADV}, - 'AtDf': {POS: DET}, - 'AUX': {POS: AUX}, - 'CCONJ': {POS: CCONJ}, - 'DET': {POS: DET}, - 'NOUN': {POS: NOUN}, - 'NUM': {POS: NUM}, - 'PART': {POS: PART}, - 'PRON': {POS: PRON}, - 'PROPN': {POS: PROPN}, - 'SCONJ': {POS: SCONJ}, - 'SYM': {POS: SYM}, - 'VERB': {POS: VERB}, - 'X': {POS: X}, + "ADJ": {POS: ADJ}, + "ADP": {POS: ADP}, + "ADV": {POS: ADV}, + 
"AtDf": {POS: DET}, + "AUX": {POS: AUX}, + "CCONJ": {POS: CCONJ}, + "DET": {POS: DET}, + "NOUN": {POS: NOUN}, + "NUM": {POS: NUM}, + "PART": {POS: PART}, + "PRON": {POS: PRON}, + "PROPN": {POS: PROPN}, + "SCONJ": {POS: SCONJ}, + "SYM": {POS: SYM}, + "VERB": {POS: VERB}, + "X": {POS: X}, } diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 0538461a3..22590043f 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -16,7 +16,8 @@ from ...util import DummyTokenizer # the flow by creating a dummy with the same interface. DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"]) DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"]) -DummySpace = DummyNode(' ', ' ', DummyNodeFeatures(' ')) +DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" ")) + def try_fugashi_import(): """Fugashi is required for Japanese support, so check for it. @@ -27,8 +28,7 @@ def try_fugashi_import(): return fugashi except ImportError: raise ImportError( - "Japanese support requires Fugashi: " - "https://github.com/polm/fugashi" + "Japanese support requires Fugashi: " "https://github.com/polm/fugashi" ) @@ -55,13 +55,14 @@ def resolve_pos(token): return token.pos + ",ADJ" return token.pos + def get_words_and_spaces(tokenizer, text): """Get the individual tokens that make up the sentence and handle white space. Japanese doesn't usually use white space, and MeCab's handling of it for multiple spaces in a row is somewhat awkward. 
""" - + tokens = tokenizer.parseToNodeList(text) words = [] @@ -76,6 +77,7 @@ def get_words_and_spaces(tokenizer, text): spaces.append(bool(token.white_space)) return words, spaces + class JapaneseTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py index 68531d9d0..1571e13d7 100644 --- a/spacy/lang/lb/punctuation.py +++ b/spacy/lang/lb/punctuation.py @@ -1,8 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_ELLIPSES, LIST_ICONS -from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER ELISION = " ' ’ ".strip().replace(" ", "") diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py index 8a35b6fb7..b32daa58c 100644 --- a/spacy/lang/lb/tokenizer_exceptions.py +++ b/spacy/lang/lb/tokenizer_exceptions.py @@ -20,7 +20,7 @@ for exc_data in [ {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"}, {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"}, {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"}, - {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"} + {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/lang/nb/tag_map.py b/spacy/lang/nb/tag_map.py index cf4c95840..ca0ece265 100644 --- a/spacy/lang/nb/tag_map.py +++ b/spacy/lang/nb/tag_map.py @@ -467,38 +467,110 @@ TAG_MAP = { "VERB__VerbForm=Part": {"morph": "VerbForm=Part", POS: VERB}, "VERB___": {"morph": "_", POS: VERB}, "X___": {"morph": "_", POS: X}, - 'CCONJ___': {"morph": "_", POS: CCONJ}, + "CCONJ___": {"morph": "_", POS: CCONJ}, "ADJ__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADJ}, "ADJ__Abbr=Yes|Degree=Pos": {"morph": "Abbr=Yes|Degree=Pos", POS: ADJ}, - "ADJ__Case=Gen|Definite=Def|Number=Sing|VerbForm=Part": 
{"morph": "Case=Gen|Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ}, - "ADJ__Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ}, - "ADJ__Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part", POS: ADJ}, - "ADJ__Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part", POS: ADJ}, - "ADJ__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: ADJ}, + "ADJ__Case=Gen|Definite=Def|Number=Sing|VerbForm=Part": { + "morph": "Case=Gen|Definite=Def|Number=Sing|VerbForm=Part", + POS: ADJ, + }, + "ADJ__Definite=Def|Number=Sing|VerbForm=Part": { + "morph": "Definite=Def|Number=Sing|VerbForm=Part", + POS: ADJ, + }, + "ADJ__Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part": { + "morph": "Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part", + POS: ADJ, + }, + "ADJ__Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part": { + "morph": "Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part", + POS: ADJ, + }, + "ADJ__Definite=Ind|Number=Sing|VerbForm=Part": { + "morph": "Definite=Ind|Number=Sing|VerbForm=Part", + POS: ADJ, + }, "ADJ__Number=Sing|VerbForm=Part": {"morph": "Number=Sing|VerbForm=Part", POS: ADJ}, "ADJ__VerbForm=Part": {"morph": "VerbForm=Part", POS: ADJ}, "ADP__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADP}, "ADV__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADV}, - "DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": {"morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art", POS: DET}, - "DET__Case=Gen|Number=Plur|PronType=Tot": {"morph": "Case=Gen|Number=Plur|PronType=Tot", POS: DET}, + "DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": { + "morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art", + POS: DET, + }, + "DET__Case=Gen|Number=Plur|PronType=Tot": { + "morph": "Case=Gen|Number=Plur|PronType=Tot", + POS: DET, + }, "DET__Definite=Def|PronType=Prs": 
{"morph": "Definite=Def|PronType=Prs", POS: DET}, - "DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs", POS: DET}, - "DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs", POS: DET}, - "DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs", POS: DET}, - "DET__Gender=Fem|Number=Sing|PronType=Art": {"morph": "Gender=Fem|Number=Sing|PronType=Art", POS: DET}, - "DET__Gender=Fem|Number=Sing|PronType=Ind": {"morph": "Gender=Fem|Number=Sing|PronType=Ind", POS: DET}, - "DET__Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|PronType=Prs", POS: DET}, - "DET__Gender=Fem|Number=Sing|PronType=Tot": {"morph": "Gender=Fem|Number=Sing|PronType=Tot", POS: DET}, - "DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET}, - "DET__Gender=Masc|Number=Sing|PronType=Art": {"morph": "Gender=Masc|Number=Sing|PronType=Art", POS: DET}, - "DET__Gender=Masc|Number=Sing|PronType=Ind": {"morph": "Gender=Masc|Number=Sing|PronType=Ind", POS: DET}, - "DET__Gender=Masc|Number=Sing|PronType=Tot": {"morph": "Gender=Masc|Number=Sing|PronType=Tot", POS: DET}, - "DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET}, - "DET__Gender=Neut|Number=Sing|PronType=Art": {"morph": "Gender=Neut|Number=Sing|PronType=Art", POS: DET}, - "DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind", POS: DET}, - "DET__Gender=Neut|Number=Sing|PronType=Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Ind", POS: DET}, - "DET__Gender=Neut|Number=Sing|PronType=Tot": {"morph": "Gender=Neut|Number=Sing|PronType=Tot", POS: DET}, - "DET__Number=Plur|Polarity=Neg|PronType=Neg": {"morph": 
"Number=Plur|Polarity=Neg|PronType=Neg", POS: DET}, + "DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": { + "morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs", + POS: DET, + }, + "DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": { + "morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs", + POS: DET, + }, + "DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": { + "morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs", + POS: DET, + }, + "DET__Gender=Fem|Number=Sing|PronType=Art": { + "morph": "Gender=Fem|Number=Sing|PronType=Art", + POS: DET, + }, + "DET__Gender=Fem|Number=Sing|PronType=Ind": { + "morph": "Gender=Fem|Number=Sing|PronType=Ind", + POS: DET, + }, + "DET__Gender=Fem|Number=Sing|PronType=Prs": { + "morph": "Gender=Fem|Number=Sing|PronType=Prs", + POS: DET, + }, + "DET__Gender=Fem|Number=Sing|PronType=Tot": { + "morph": "Gender=Fem|Number=Sing|PronType=Tot", + POS: DET, + }, + "DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": { + "morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg", + POS: DET, + }, + "DET__Gender=Masc|Number=Sing|PronType=Art": { + "morph": "Gender=Masc|Number=Sing|PronType=Art", + POS: DET, + }, + "DET__Gender=Masc|Number=Sing|PronType=Ind": { + "morph": "Gender=Masc|Number=Sing|PronType=Ind", + POS: DET, + }, + "DET__Gender=Masc|Number=Sing|PronType=Tot": { + "morph": "Gender=Masc|Number=Sing|PronType=Tot", + POS: DET, + }, + "DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": { + "morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg", + POS: DET, + }, + "DET__Gender=Neut|Number=Sing|PronType=Art": { + "morph": "Gender=Neut|Number=Sing|PronType=Art", + POS: DET, + }, + "DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": { + "morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind", + POS: DET, + }, + "DET__Gender=Neut|Number=Sing|PronType=Ind": { + "morph": "Gender=Neut|Number=Sing|PronType=Ind", + POS: DET, + }, + "DET__Gender=Neut|Number=Sing|PronType=Tot": { + "morph": 
"Gender=Neut|Number=Sing|PronType=Tot", + POS: DET, + }, + "DET__Number=Plur|Polarity=Neg|PronType=Neg": { + "morph": "Number=Plur|Polarity=Neg|PronType=Neg", + POS: DET, + }, "DET__Number=Plur|PronType=Art": {"morph": "Number=Plur|PronType=Art", POS: DET}, "DET__Number=Plur|PronType=Ind": {"morph": "Number=Plur|PronType=Ind", POS: DET}, "DET__Number=Plur|PronType=Prs": {"morph": "Number=Plur|PronType=Prs", POS: DET}, @@ -507,57 +579,183 @@ TAG_MAP = { "DET__PronType=Prs": {"morph": "PronType=Prs", POS: DET}, "NOUN__Abbr=Yes": {"morph": "Abbr=Yes", POS: NOUN}, "NOUN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: NOUN}, - "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing", POS: NOUN}, - "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing", POS: NOUN}, - "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing", POS: NOUN}, + "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": { + "morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing", + POS: NOUN, + }, + "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": { + "morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing", + POS: NOUN, + }, + "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": { + "morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing", + POS: NOUN, + }, "NOUN__Abbr=Yes|Gender=Masc": {"morph": "Abbr=Yes|Gender=Masc", POS: NOUN}, - "NUM__Case=Gen|Number=Plur|NumType=Card": {"morph": "Case=Gen|Number=Plur|NumType=Card", POS: NUM}, - "NUM__Definite=Def|Number=Sing|NumType=Card": {"morph": "Definite=Def|Number=Sing|NumType=Card", POS: NUM}, + "NUM__Case=Gen|Number=Plur|NumType=Card": { + "morph": "Case=Gen|Number=Plur|NumType=Card", + POS: NUM, + }, + "NUM__Definite=Def|Number=Sing|NumType=Card": { + "morph": 
"Definite=Def|Number=Sing|NumType=Card", + POS: NUM, + }, "NUM__Definite=Def|NumType=Card": {"morph": "Definite=Def|NumType=Card", POS: NUM}, - "NUM__Gender=Fem|Number=Sing|NumType=Card": {"morph": "Gender=Fem|Number=Sing|NumType=Card", POS: NUM}, - "NUM__Gender=Masc|Number=Sing|NumType=Card": {"morph": "Gender=Masc|Number=Sing|NumType=Card", POS: NUM}, - "NUM__Gender=Neut|Number=Sing|NumType=Card": {"morph": "Gender=Neut|Number=Sing|NumType=Card", POS: NUM}, + "NUM__Gender=Fem|Number=Sing|NumType=Card": { + "morph": "Gender=Fem|Number=Sing|NumType=Card", + POS: NUM, + }, + "NUM__Gender=Masc|Number=Sing|NumType=Card": { + "morph": "Gender=Masc|Number=Sing|NumType=Card", + POS: NUM, + }, + "NUM__Gender=Neut|Number=Sing|NumType=Card": { + "morph": "Gender=Neut|Number=Sing|NumType=Card", + POS: NUM, + }, "NUM__Number=Plur|NumType=Card": {"morph": "Number=Plur|NumType=Card", POS: NUM}, "NUM__Number=Sing|NumType=Card": {"morph": "Number=Sing|NumType=Card", POS: NUM}, "NUM__NumType=Card": {"morph": "NumType=Card", POS: NUM}, "PART__Polarity=Neg": {"morph": "Polarity=Neg", POS: PART}, - "PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": {"morph": 
"Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Number=Plur|PronType=Rcp": {"morph": "Animacy=Hum|Number=Plur|PronType=Rcp", POS: PRON}, - "PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs", POS: PRON}, - "PRON__Animacy=Hum|Poss=Yes|PronType=Int": {"morph": "Animacy=Hum|Poss=Yes|PronType=Int", POS: PRON}, + "PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { + "morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { + 
"morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": { + "morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": { + "morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": { + "morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": { + "morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": { + "morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": { + "morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs", + POS: PRON, + }, + 
"PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Number=Plur|PronType=Rcp": { + "morph": "Animacy=Hum|Number=Plur|PronType=Rcp", + POS: PRON, + }, + "PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": { + "morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Poss=Yes|PronType=Int": { + "morph": "Animacy=Hum|Poss=Yes|PronType=Int", + POS: PRON, + }, "PRON__Animacy=Hum|PronType=Int": {"morph": "Animacy=Hum|PronType=Int", POS: PRON}, - "PRON__Case=Acc|PronType=Prs|Reflex=Yes": {"morph": "Case=Acc|PronType=Prs|Reflex=Yes", POS: PRON}, - "PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": { "morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON}, - "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON}, - "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot", POS: PRON}, - "PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON}, - "PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON}, - "PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON}, - "PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON}, - "PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": {"morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON}, - "PRON__Number=Plur|Person=3|PronType=Ind,Prs": {"morph": "Number=Plur|Person=3|PronType=Ind,Prs", POS: PRON}, - "PRON__Number=Plur|Person=3|PronType=Prs,Tot": {"morph": 
"Number=Plur|Person=3|PronType=Prs,Tot", POS: PRON}, - "PRON__Number=Plur|Poss=Yes|PronType=Prs": {"morph": "Number=Plur|Poss=Yes|PronType=Prs", POS: PRON}, - "PRON__Number=Plur|Poss=Yes|PronType=Rcp": {"morph": "Number=Plur|Poss=Yes|PronType=Rcp", POS: PRON}, - "PRON__Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Number=Sing|Polarity=Neg|PronType=Neg", POS: PRON}, + "PRON__Case=Acc|PronType=Prs|Reflex=Yes": { + "morph": "Case=Acc|PronType=Prs|Reflex=Yes", + POS: PRON, + }, + "PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": { + "morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs", + POS: PRON, + }, + "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": { + "morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs", + POS: PRON, + }, + "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": { + "morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot", + POS: PRON, + }, + "PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": { + "morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs", + POS: PRON, + }, + "PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": { + "morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs", + POS: PRON, + }, + "PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": { + "morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs", + POS: PRON, + }, + "PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": { + "morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs", + POS: PRON, + }, + "PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": { + "morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs", + POS: PRON, + }, + "PRON__Number=Plur|Person=3|PronType=Ind,Prs": { + "morph": "Number=Plur|Person=3|PronType=Ind,Prs", + POS: PRON, + }, + "PRON__Number=Plur|Person=3|PronType=Prs,Tot": { + "morph": "Number=Plur|Person=3|PronType=Prs,Tot", + POS: PRON, + }, + "PRON__Number=Plur|Poss=Yes|PronType=Prs": { + "morph": 
"Number=Plur|Poss=Yes|PronType=Prs", + POS: PRON, + }, + "PRON__Number=Plur|Poss=Yes|PronType=Rcp": { + "morph": "Number=Plur|Poss=Yes|PronType=Rcp", + POS: PRON, + }, + "PRON__Number=Sing|Polarity=Neg|PronType=Neg": { + "morph": "Number=Sing|Polarity=Neg|PronType=Neg", + POS: PRON, + }, "PRON__PronType=Prs": {"morph": "PronType=Prs", POS: PRON}, "PRON__PronType=Rel": {"morph": "PronType=Rel", POS: PRON}, "PROPN__Abbr=Yes": {"morph": "Abbr=Yes", POS: PROPN}, "PROPN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: PROPN}, - "VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": {"morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin", POS: VERB}, - "VERB__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: VERB}, + "VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": { + "morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin", + POS: VERB, + }, + "VERB__Definite=Ind|Number=Sing|VerbForm=Part": { + "morph": "Definite=Ind|Number=Sing|VerbForm=Part", + POS: VERB, + }, } diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 205697637..2db312d64 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -295,10 +295,9 @@ class EntityRuler(object): deserializers_patterns = { "patterns": lambda p: self.add_patterns( srsly.read_jsonl(p.with_suffix(".jsonl")) - )} - deserializers_cfg = { - "cfg": lambda p: cfg.update(srsly.read_json(p)) + ) } + deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))} from_disk(path, deserializers_cfg, {}) self.overwrite = cfg.get("overwrite", False) self.phrase_matcher_attr = cfg.get("phrase_matcher_attr") diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 1a33221c2..816970e61 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -219,14 +219,13 @@ def uk_tokenizer(): def ur_tokenizer(): return get_lang_class("ur").Defaults.create_tokenizer() - + @pytest.fixture(scope="session") def yo_tokenizer(): return 
get_lang_class("yo").Defaults.create_tokenizer() - + @pytest.fixture(scope="session") def zh_tokenizer(): pytest.importorskip("jieba") return get_lang_class("zh").Defaults.create_tokenizer() - diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py index cbbebcf28..17f6f0ccc 100644 --- a/spacy/tests/lang/fi/test_tokenizer.py +++ b/spacy/tests/lang/fi/test_tokenizer.py @@ -15,7 +15,7 @@ ABBREVIATION_TESTS = [ HYPHENATED_TESTS = [ ( "1700-luvulle sijoittuva taide-elokuva", - ["1700-luvulle", "sijoittuva", "taide-elokuva"] + ["1700-luvulle", "sijoittuva", "taide-elokuva"], ) ] diff --git a/spacy/tests/lang/lb/test_exceptions.py b/spacy/tests/lang/lb/test_exceptions.py index 57541fc26..7ca2394b7 100644 --- a/spacy/tests/lang/lb/test_exceptions.py +++ b/spacy/tests/lang/lb/test_exceptions.py @@ -3,16 +3,19 @@ from __future__ import unicode_literals import pytest + @pytest.mark.parametrize("text", ["z.B.", "Jan."]) def test_lb_tokenizer_handles_abbr(lb_tokenizer, text): tokens = lb_tokenizer(text) assert len(tokens) == 1 + @pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"]) def test_lb_tokenizer_splits_contractions(lb_tokenizer, text): tokens = lb_tokenizer(text) assert len(tokens) == 2 + def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer): text = "Mee 't ass net evident, d'Liewen." 
tokens = lb_tokenizer(text) @@ -20,6 +23,7 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer): assert tokens[1].text == "'t" assert tokens[1].lemma_ == "et" + @pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")]) def test_lb_norm_exceptions(lb_tokenizer, text, norm): tokens = lb_tokenizer(text) diff --git a/spacy/tests/lang/lb/test_text.py b/spacy/tests/lang/lb/test_text.py index 2284ff794..36464b379 100644 --- a/spacy/tests/lang/lb/test_text.py +++ b/spacy/tests/lang/lb/test_text.py @@ -16,7 +16,7 @@ def test_lb_tokenizer_handles_long_text(lb_tokenizer): [ ("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13), ("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15), - ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14) + ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14), ], ) def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length): diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index ff630f0fa..4bb5aac70 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -87,4 +87,4 @@ def test_lex_attrs_like_url(text, match): ], ) def test_lex_attrs_word_shape(text, shape): - assert word_shape(text) == shape \ No newline at end of file + assert word_shape(text) == shape diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 384f14dad..fb5301718 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -151,17 +151,17 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): def test_parser_set_sent_starts(en_vocab): + # fmt: off words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 
'nicht', '.', '\n'] heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1] deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', ''] - doc = get_doc( - en_vocab, words=words, deps=deps, heads=heads - ) + # fmt: on + doc = get_doc(en_vocab, words=words, deps=deps, heads=heads) for i in range(len(words)): if i == 0 or i == 3: - assert doc[i].is_sent_start == True + assert doc[i].is_sent_start is True else: - assert doc[i].is_sent_start == None + assert doc[i].is_sent_start is None for sent in doc.sents: for token in sent: assert token.head in sent diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index d0331602c..a5bda9090 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import pytest from spacy.language import Language -from spacy.pipeline import Tagger def test_label_types(): diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py index 36e9f02c1..5f8d1573f 100644 --- a/spacy/tests/regression/test_issue4674.py +++ b/spacy/tests/regression/test_issue4674.py @@ -15,7 +15,9 @@ def test_issue4674(): vector1 = [0.9, 1.1, 1.01] vector2 = [1.8, 2.25, 2.01] - kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2]) + kb.set_entities( + entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2] + ) assert kb.get_size_entities() == 1 @@ -31,4 +33,3 @@ def test_issue4674(): kb2.load_bulk(str(file_path)) assert kb2.get_size_entities() == 1 - From 732142bf2825be61824d453009cc0cea130c3b4b Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sat, 21 Dec 2019 21:12:19 
+0100 Subject: [PATCH 03/11] facilitate larger training files (#4827) * add warning for large file and change start var to long * type for file_length --- spacy/errors.py | 4 ++++ spacy/gold.pyx | 10 +++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index dd2b38eb9..ce35d706c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -105,6 +105,10 @@ class Warnings(object): W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " "previous components in the pipeline declare that they assign it.") W026 = ("Unable to set all sentence boundaries from dependency parses.") + W027 = ("Found a large training file of {size} bytes. Note that it may " + "be more efficient to split your training data into multiple " + "smaller JSON files instead.") + @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 1a74d2206..1d7f80c92 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -13,7 +13,7 @@ import srsly from .syntax import nonproj from .tokens import Doc, Span -from .errors import Errors, AlignmentError +from .errors import Errors, AlignmentError, user_warning, Warnings from .compat import path2str from . 
import util from .util import minibatch, itershuffle @@ -557,12 +557,16 @@ def _json_iterate(loc): loc = util.ensure_path(loc) with loc.open("rb") as file_: py_raw = file_.read() + cdef long file_length = len(py_raw) + if file_length > 2 ** 30: + user_warning(Warnings.W027.format(size=file_length)) + raw = py_raw cdef int square_depth = 0 cdef int curly_depth = 0 cdef int inside_string = 0 cdef int escape = 0 - cdef int start = -1 + cdef long start = -1 cdef char c cdef char quote = ord('"') cdef char backslash = ord("\\") @@ -570,7 +574,7 @@ def _json_iterate(loc): cdef char close_square = ord("]") cdef char open_curly = ord("{") cdef char close_curly = ord("}") - for i in range(len(py_raw)): + for i in range(file_length): c = raw[i] if escape: escape = False From 7c69d30de5aa58d330a183a0e5015e67c36ca7bc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 21 Dec 2019 21:14:52 +0100 Subject: [PATCH 04/11] Tidy up and expect warning --- spacy/tests/regression/test_issue4674.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py index 5f8d1573f..8d0c32eaa 100644 --- a/spacy/tests/regression/test_issue4674.py +++ b/spacy/tests/regression/test_issue4674.py @@ -1,11 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import pytest from spacy.kb import KnowledgeBase from spacy.util import ensure_path - from spacy.lang.en import English -from spacy.tests.util import make_tempdir + +from ..tests.util import make_tempdir def test_issue4674(): @@ -15,9 +16,12 @@ def test_issue4674(): vector1 = [0.9, 1.1, 1.01] vector2 = [1.8, 2.25, 2.01] - kb.set_entities( - entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2] - ) + with pytest.warns(UserWarning): + kb.set_entities( + entity_list=["Q1", "Q1"], + freq_list=[32, 111], + vector_list=[vector1, vector2], + ) assert kb.get_size_entities() == 1 From 
3431ac42de470a4bb73f1c6852a5ccffc07da7b1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 21 Dec 2019 21:17:45 +0100 Subject: [PATCH 05/11] Fix typo --- spacy/tests/regression/test_issue4674.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py index 8d0c32eaa..8fa4f9259 100644 --- a/spacy/tests/regression/test_issue4674.py +++ b/spacy/tests/regression/test_issue4674.py @@ -6,7 +6,7 @@ from spacy.kb import KnowledgeBase from spacy.util import ensure_path from spacy.lang.en import English -from ..tests.util import make_tempdir +from ..util import make_tempdir def test_issue4674(): From fd4a7bd2b76081f31ad7e12a25f6ba028792f661 Mon Sep 17 00:00:00 2001 From: Al Johri Date: Sun, 29 Dec 2019 08:17:28 -0500 Subject: [PATCH 06/11] sign contributor agreement for AlJohri (#4839) [ci skip] --- .github/contributors/AlJohri.md | 106 ++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/AlJohri.md diff --git a/.github/contributors/AlJohri.md b/.github/contributors/AlJohri.md new file mode 100644 index 000000000..4b2797ab0 --- /dev/null +++ b/.github/contributors/AlJohri.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Al Johri | +| Company name (if applicable) | N/A | +| Title or role (if applicable) | N/A | +| Date | December 27th, 2019 | +| GitHub username | AlJohri | +| Website (optional) | http://aljohri.com/ | From ef13e0c038c633d938891b6cedec898a9c4f3fff Mon Sep 17 00:00:00 2001 From: Ivan Echevarria Date: Sun, 29 Dec 2019 05:23:33 -0800 Subject: [PATCH 07/11] Add n_process to Language.pipe documentation (#4842) [ci skip] * Add n_process to documentation * Auto-format and add default [ci skip] Co-authored-by: Ines Montani --- .github/contributors/iechevarria.md | 106 ++++++++++++++++++++++++++++ spacy/errors.py | 7 +- website/docs/api/language.md | 7 +- 3 files changed, 113 insertions(+), 7 deletions(-) create mode 100644 .github/contributors/iechevarria.md diff --git a/.github/contributors/iechevarria.md b/.github/contributors/iechevarria.md new file mode 100644 index 000000000..f0c05efc9 --- /dev/null +++ b/.github/contributors/iechevarria.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | --------------------- | +| Name | Ivan Echevarria | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2019-12-24 | +| GitHub username | iechevarria | +| Website (optional) | https://echevarria.io | diff --git a/spacy/errors.py b/spacy/errors.py index ce35d706c..fd0f66cd9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -78,10 +78,9 @@ class Warnings(object): W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from " "being serialized or deserialized is deprecated. Please use the " "`exclude` argument instead. For example: exclude=['{arg}'].") - W016 = ("The keyword argument `n_threads` on the is now deprecated, as " - "the v2.x models cannot release the global interpreter lock. " - "Future versions may introduce a `n_process` argument for " - "parallel inference via multiprocessing.") + W016 = ("The keyword argument `n_threads` is now deprecated. As of v2.2.2, " + "the argument `n_process` controls parallel inference via " + "multiprocessing.") W017 = ("Alias '{alias}' already exists in the Knowledge Base.") W018 = ("Entity '{entity}' already exists in the Knowledge Base - " "ignoring the duplicate entry.") diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 6e7f6be3e..d548a1f64 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -77,9 +77,9 @@ more efficient than processing texts one-by-one. 
Early versions of spaCy used simple statistical models that could be efficiently multi-threaded, as we were able to entirely release Python's global interpreter lock. The multi-threading was controlled using the `n_threads` keyword argument -to the `.pipe` method. This keyword argument is now deprecated as of v2.1.0. -Future versions may introduce a `n_process` argument for parallel inference via -multiprocessing. +to the `.pipe` method. This keyword argument is now deprecated as of v2.1.0. A +new keyword argument, `n_process`, was introduced to control parallel inference +via multiprocessing in v2.2.2. @@ -98,6 +98,7 @@ multiprocessing. | `batch_size` | int | The number of texts to buffer. | | `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. | +| `n_process` 2.2.2 | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | | **YIELDS** | `Doc` | Documents in the order of the original text. | ## Language.update {#update tag="method"} From 1830a12578700a27c59f62cd39c38c99e8ac74eb Mon Sep 17 00:00:00 2001 From: Anastasiia Iurshina Date: Sun, 29 Dec 2019 14:24:13 +0100 Subject: [PATCH 08/11] Fixes typos (#4843) * Fixes typos * Fixes typo * Contributor agreement --- .github/contributors/iurshina.md | 106 +++++++++++++++++++++++++++++++ spacy/_align.pyx | 2 +- spacy/tokens/doc.pyx | 4 +- 3 files changed, 109 insertions(+), 3 deletions(-) create mode 100644 .github/contributors/iurshina.md diff --git a/.github/contributors/iurshina.md b/.github/contributors/iurshina.md new file mode 100644 index 000000000..226813084 --- /dev/null +++ b/.github/contributors/iurshina.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). 
+The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Anastasiia Iurshina | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 28.12.2019 | +| GitHub username | iurshina | +| Website (optional) | | diff --git a/spacy/_align.pyx b/spacy/_align.pyx index 8ae7cdf4e..6786ec7ba 100644 --- a/spacy/_align.pyx +++ b/spacy/_align.pyx @@ -30,7 +30,7 @@ S[:i] -> T[:j] (at D[i,j]) S[:i+1] -> T[:j] (at D[i+1,j]) S[:i] -> T[:j+1] (at D[i,j+1]) -Further, we now we can tranform: +Further, now we can transform: S[:i+1] -> S[:i] (DEL) for 1, T[:j+1] -> T[:j] (INS) for 1.
S[i+1] -> T[j+1] (SUB) for 0 or 1 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 716df1087..6bd982e35 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -994,9 +994,9 @@ cdef class Doc: order, and no span intersection is allowed. spans (Span[]): Spans to merge, in document order, with all span - intersections empty. Cannot be emty. + intersections empty. Cannot be empty. attributes (Dictionary[]): Attributes to assign to the merged tokens. By default, - must be the same lenghth as spans, emty dictionaries are allowed. + must be the same length as spans, empty dictionaries are allowed. attributes are inherited from the syntactic root of the span. RETURNS (Token): The first newly merged token. """ From db9257559c0642262a46d7acb7855e1e23b50e56 Mon Sep 17 00:00:00 2001 From: Anastasiia Iurshina Date: Sun, 29 Dec 2019 14:25:05 +0100 Subject: [PATCH 09/11] Adds script shebang (#4846) --- bin/spacy | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/spacy b/bin/spacy index 29d9a80e5..11359669c 100644 --- a/bin/spacy +++ b/bin/spacy @@ -1 +1,2 @@ +#! 
/bin/sh python -m spacy "$@" From 1aa2d4dac9ef414b3388743c40cc65e4880f115a Mon Sep 17 00:00:00 2001 From: Al Johri Date: Wed, 1 Jan 2020 07:15:05 -0500 Subject: [PATCH 10/11] stop rendering mathjax by default in displacy (#4840) * stop rendering mathjax by default in displacy * Replace f-string and add comment Co-authored-by: Ines Montani --- spacy/displacy/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index d2ef21dbd..c17b80aef 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -55,9 +55,10 @@ def render( html = RENDER_WRAPPER(html) if jupyter or (jupyter is None and is_in_jupyter()): # return HTML rendered by IPython display() + # See #4840 for details on span wrapper to disable mathjax from IPython.core.display import display, HTML - return display(HTML(html)) + return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html))) return html From 400257a8029f8c1c51fd8f3283760e3fe492ceda Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 4 Jan 2020 01:52:18 +0100 Subject: [PATCH 11/11] Update index.md [ci skip] --- website/docs/usage/index.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 441297813..17fd8fa7b 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -124,9 +124,8 @@ interface for GPU arrays. spaCy can be installed on GPU by specifying `spacy[cuda]`, `spacy[cuda90]`, `spacy[cuda91]`, `spacy[cuda92]` or `spacy[cuda100]`. If you know your cuda version, using the more explicit specifier allows cupy to be installed via -wheel, saving some compilation time. The specifiers should install two -libraries: [`cupy`](https://cupy.chainer.org) and -[`thinc_gpu_ops`](https://github.com/explosion/thinc_gpu_ops). +wheel, saving some compilation time. The specifiers should install +[`cupy`](https://cupy.chainer.org). ```bash $ pip install -U spacy[cuda92]