From 40d33261d62fb5faabd7288843e46f935fad14b4 Mon Sep 17 00:00:00 2001 From: Tim <33530562+tmetzl@users.noreply.github.com> Date: Sun, 10 Mar 2019 23:36:13 +0100 Subject: [PATCH 01/27] Fixed typo in example of html visualizer (#3387) * Fixed typo * Add contributor agreement for tmetzl --- .github/contributors/tmetzl.md | 106 ++++++++++++++++++++++++++ website/usage/_visualizers/_html.jade | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/tmetzl.md diff --git a/.github/contributors/tmetzl.md b/.github/contributors/tmetzl.md new file mode 100644 index 000000000..e3c8529c8 --- /dev/null +++ b/.github/contributors/tmetzl.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Tim Metzler | +| Company name (if applicable) | University of Applied Sciences Bonn-Rhein-Sieg | +| Title or role (if applicable) | | +| Date | 03/10/2019 | +| GitHub username | tmetzl | +| Website (optional) | | diff --git a/website/usage/_visualizers/_html.jade b/website/usage/_visualizers/_html.jade index 8e6948ebf..0de369141 100644 --- a/website/usage/_visualizers/_html.jade +++ b/website/usage/_visualizers/_html.jade @@ -54,7 +54,7 @@ p nlp = spacy.load('en') sentences = ["This is an example.", "This is another one."] for sent in sentences: - doc = nlp(sentence) + doc = nlp(sent) svg = displacy.render(doc, style='dep') file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg' output_path = Path('/images/' + file_name) From 4e8a07c7d343ae82b78bd7375785f9e150b3e64b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 11 Mar 2019 10:45:06 +0100 Subject: [PATCH 02/27] Set version to v2.1.0a11 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index a4ee29189..4b48d4d4d 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "2.1.0a10" +__version__ = "2.1.0a11" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From c399162a82d14a20428cede775bf6da7d9ef97b5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 13:34:14 +0100 Subject: [PATCH 03/27] Tidy up --- spacy/tests/regression/test_issue2001-2500.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index df5d76641..82b3a81a9 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -7,7 +7,6 @@ from spacy.tokens import Doc from spacy.displacy import render from spacy.gold import iob_to_biluo from spacy.lang.it import Italian -import numpy from spacy.lang.en import English from ..util import add_vecs_to_vocab, get_doc From ef80cfde6f5ffc6cb7be4b686c934a2ecf5c8ae6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 13:34:23 +0100 Subject: [PATCH 04/27] Fix pickling of Japanese (closes #3191) --- spacy/lang/ja/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 39a3a3385..daea9b8d6 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -9,6 +9,7 @@ from .tag_map import TAG_MAP from ...attrs import LANG from ...language import Language from ...tokens import Doc, Token +from ...compat import copy_reg from ...util import DummyTokenizer @@ -107,4 +108,11 @@ class Japanese(Language): return self.tokenizer(text) +def pickle_japanese(instance): + return Japanese, tuple() + + +copy_reg.pickle(Japanese, pickle_japanese) + + __all__ = ["Japanese"] From ebcf2bb1c34a0ba8c3818fc62f7cefbdaa7cd708 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 14:21:40 +0100 Subject: [PATCH 05/27] Add Doc.lang and Doc.lang_ --- spacy/tests/doc/test_doc_api.py | 6 ++++++ spacy/tokens/doc.pyx | 12 +++++++++++- website/docs/api/doc.md | 2 ++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 4069e018a..86c7fbf72 100644 --- 
a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -272,3 +272,9 @@ def test_doc_is_nered(en_vocab): # Test serialization new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) assert new_doc.is_nered + + +def test_doc_lang(en_vocab): + doc = Doc(en_vocab, words=["Hello", "world"]) + assert doc.lang_ == "en" + assert doc.lang == en_vocab.strings["en"] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4d3ed084a..857c7b538 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -597,6 +597,16 @@ cdef class Doc: if start != self.length: yield Span(self, start, self.length) + @property + def lang(self): + """RETURNS (uint64): ID of the language of the doc's vocabulary.""" + return self.vocab.strings[self.vocab.lang] + + @property + def lang_(self): + """RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'.""" + return self.vocab.lang + cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: if self.length == 0: # Flip these to false when we see the first token. @@ -748,7 +758,7 @@ cdef class Doc: # Allow strings, e.g. 'lemma' or 'LEMMA' attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in attrs] - + if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) cdef int i, col diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 953a31c2d..f5a94335f 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -654,6 +654,8 @@ The L2 norm of the document's vector representation. | `tensor` 2 | object | Container for dense vector representations. | | `cats` 2 | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. | | `user_data` | - | A generic storage area, for user custom data. | +| `lang` 2.1 | int | Language of the document's vocabulary. | +| `lang_` 2.1 | unicode | Language of the document's vocabulary. | | `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. | | `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. | | `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. 
| From ee4f312e89fe262a682011da3a7881bfbf47fcdf Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 14:22:23 +0100 Subject: [PATCH 06/27] Add writing_system to ArabicDefaults (experimental) --- spacy/lang/ar/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py index c6ff071cf..c120703f6 100644 --- a/spacy/lang/ar/__init__.py +++ b/spacy/lang/ar/__init__.py @@ -23,6 +23,7 @@ class ArabicDefaults(Language.Defaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS suffixes = TOKENIZER_SUFFIXES + writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} class Arabic(Language): From 39a4741e264d75599508f1a4d4f4fa797c05c263 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 11 Mar 2019 15:23:20 +0100 Subject: [PATCH 07/27] Add support for vocab.writing_system property (#3390) * Add xfail test for vocab.writing_system * Add vocab.writing_system property * Set Language.Defaults.writing_system * Set default writing system * Remove xfail on test_vocab_writing_system --- spacy/lang/fa/__init__.py | 1 + spacy/lang/he/__init__.py | 2 +- spacy/lang/ja/__init__.py | 1 + spacy/lang/zh/__init__.py | 2 +- spacy/language.py | 1 + spacy/tests/vocab_vectors/test_vocab_api.py | 5 +++++ spacy/util.py | 12 ++++++++++++ spacy/vocab.pyx | 11 +++++++++++ 8 files changed, 33 insertions(+), 2 deletions(-) diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 8756c3ff9..4041ec635 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -27,6 +27,7 @@ class PersianDefaults(Language.Defaults): stop_words = STOP_WORDS tag_map = TAG_MAP suffixes = TOKENIZER_SUFFIXES + writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} class Persian(Language): diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index c7ba4ebf8..0ad65a0b4 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -14,7 +14,7 @@ class HebrewDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda text: "he" tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = STOP_WORDS - + writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} class Hebrew(Language): lang = "he" diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index daea9b8d6..e35967409 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -94,6 +94,7 @@ class JapaneseDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda _text: "ja" stop_words = STOP_WORDS tag_map = TAG_MAP + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} @classmethod def create_tokenizer(cls, nlp=None): diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 04a7d1508..708e446ba 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -14,7 +14,7 @@ class ChineseDefaults(Language.Defaults): use_jieba = True tokenizer_exceptions = BASE_EXCEPTIONS stop_words = STOP_WORDS - + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} class Chinese(Language): lang = "zh" diff --git a/spacy/language.py b/spacy/language.py index 44a819132..e97b74a77 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -94,6 +94,7 @@ class BaseDefaults(object): morph_rules = {} lex_attr_getters = LEX_ATTRS syntax_iterators = {} + writing_system = {"direction": "ltr", "has_case": True, "has_letters": True} class Language(object): diff --git 
a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index 8c826e8c3..59a911830 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -45,3 +45,8 @@ def test_vocab_api_contains(en_vocab, text): _ = en_vocab[text] # noqa: F841 assert text in en_vocab assert "LKsdjvlsakdvlaksdvlkasjdvljasdlkfvm" not in en_vocab + + +def test_vocab_writing_system(en_vocab): + assert en_vocab.writing_system["direction"] == "ltr" + assert en_vocab.writing_system["has_case"] == True diff --git a/spacy/util.py b/spacy/util.py index 0066b196d..137d466d5 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -38,6 +38,18 @@ def set_env_log(value): _PRINT_ENV = value +def lang_class_is_loaded(lang): + """Check whether a Language class is already loaded. Language classes are + loaded lazily, to avoid expensive setup code associated with the language + data. + + lang (unicode): Two-letter language code, e.g. 'en'. + RETURNS (bool): Whether a Language class has been loaded. + """ + global LANGUAGES + return lang in LANGUAGES + + def get_lang_class(lang): """Import and load a Language class. diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 0923f977a..90e7dca34 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -67,6 +67,17 @@ cdef class Vocab: langfunc = self.lex_attr_getters.get(LANG, None) return langfunc("_") if langfunc else "" + property writing_system: + """A dict with information about the language's writing system. To get + the data, we use the vocab.lang property to fetch the Language class. + If the Language class is not loaded, an empty dict is returned. + """ + def __get__(self): + if not util.lang_class_is_loaded(self.lang): + return {} + lang_class = util.get_lang_class(self.lang) + return dict(lang_class.Defaults.writing_system) + def __len__(self): """The current number of lexemes stored. From 25cb764e64814aa1ad61b8a854cb6404b38f9753 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 15:23:53 +0100 Subject: [PATCH 08/27] Document new API [ci skip] --- website/docs/api/top-level.md | 18 ++++++++++++++++++ website/docs/api/vocab.md | 11 ++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 43f075b53..ff125d2eb 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -351,6 +351,24 @@ the two-letter language code. | `name` | unicode | Two-letter language code, e.g. `'en'`. | | `cls` | `Language` | The language class, e.g. `English`. | +### util.lang_class_is_loaded (#util.lang_class_is_loaded tag="function" new="2.1") + +Check whether a `Language` class is already loaded. `Language` classes are +loaded lazily, to avoid expensive setup code associated with the language data. + +> #### Example +> +> ```python +> lang_cls = util.get_lang_class("en") +> assert util.lang_class_is_loaded("en") is True +> assert util.lang_class_is_loaded("de") is False +> ``` + +| Name | Type | Description | +| ----------- | ------- | -------------------------------------- | +| `name` | unicode | Two-letter language code, e.g. `'en'`. | +| **RETURNS** | bool | Whether the class has been loaded. | + ### util.load_model {#util.load_model tag="function" new="2"} Load a model from a shortcut link, package or data path. 
If called with a diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index 64e153331..cd21a91d6 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -288,11 +288,12 @@ Load state from a binary string. > assert type(PERSON) == int > ``` -| Name | Type | Description | -| ------------------------------------ | ------------- | --------------------------------------------- | -| `strings` | `StringStore` | A table managing the string-to-int mapping. | -| `vectors` 2 | `Vectors` | A table associating word IDs to word vectors. | -| `vectors_length` | int | Number of dimensions for each word vector. | +| Name | Type | Description | +| --------------------------------------------- | ------------- | ------------------------------------------------------------ | +| `strings` | `StringStore` | A table managing the string-to-int mapping. | +| `vectors` 2 | `Vectors` | A table associating word IDs to word vectors. | +| `vectors_length` | int | Number of dimensions for each word vector. | +| `writing_system` 2.1 | dict | A dict with information about the language's writing system. | ## Serialization fields {#serialization-fields} From c5a407e95af8127fff0e5e5c8e0ebcaeed0288e1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 15:28:22 +0100 Subject: [PATCH 09/27] Fix code style --- spacy/tests/vocab_vectors/test_vocab_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index 59a911830..d22db2d8b 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -49,4 +49,4 @@ def test_vocab_api_contains(en_vocab, text): def test_vocab_writing_system(en_vocab): assert en_vocab.writing_system["direction"] == "ltr" - assert en_vocab.writing_system["has_case"] == True + assert en_vocab.writing_system["has_case"] is True From c3df4d1108cfec0f5b612bb426bf7a0a9220960f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 15:28:34 +0100 Subject: [PATCH 10/27] Move displaCy tests to own file --- spacy/tests/test_displacy.py | 68 ++++++++++++++++++++++++++++++++++++ spacy/tests/test_misc.py | 64 --------------------------------- 2 files changed, 68 insertions(+), 64 deletions(-) create mode 100644 spacy/tests/test_displacy.py diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py new file mode 100644 index 000000000..a65060ea7 --- /dev/null +++ b/spacy/tests/test_displacy.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy import displacy +from spacy.tokens import Span + +from .util import get_doc + + +def test_displacy_parse_ents(en_vocab): + """Test that named entities on a Doc are converted into displaCy's format.""" + doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])] + ents = displacy.parse_ents(doc) + assert isinstance(ents, dict) + assert ents["text"] == "But Google is starting from behind " + assert ents["ents"] == [{"start": 4, "end": 10, "label": "ORG"}] + + +def test_displacy_parse_deps(en_vocab): + """Test that deps and tags on a Doc are converted into displaCy's format.""" + words = ["This", "is", "a", "sentence"] + heads = [1, 0, 1, -2] + pos = ["DET", "VERB", "DET", "NOUN"] + tags = ["DT", "VBZ", "DT", "NN"] + deps = ["nsubj", "ROOT", "det", "attr"] + doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, 
deps=deps) + deps = displacy.parse_deps(doc) + assert isinstance(deps, dict) + assert deps["words"] == [ + {"text": "This", "tag": "DET"}, + {"text": "is", "tag": "VERB"}, + {"text": "a", "tag": "DET"}, + {"text": "sentence", "tag": "NOUN"}, + ] + assert deps["arcs"] == [ + {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, + {"start": 2, "end": 3, "label": "det", "dir": "left"}, + {"start": 1, "end": 3, "label": "attr", "dir": "right"}, + ] + + +def test_displacy_spans(en_vocab): + """Test that displaCy can render Spans.""" + doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])] + html = displacy.render(doc[1:4], style="ent") + assert html.startswith("TEST") + + +def test_displacy_raises_for_wrong_type(en_vocab): + with pytest.raises(ValueError): + displacy.render("hello world") diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 32cc514e5..64112923f 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -4,13 +4,9 @@ from __future__ import unicode_literals import pytest from pathlib import Path from spacy import util -from spacy import displacy from spacy import prefer_gpu, require_gpu -from spacy.tokens import Span from spacy._ml import PrecomputableAffine -from .util import get_doc - @pytest.mark.parametrize("text", ["hello/world", "hello world"]) def test_util_ensure_path_succeeds(text): @@ -31,66 +27,6 @@ def test_util_get_package_path(package): assert isinstance(path, Path) -def test_displacy_parse_ents(en_vocab): - """Test that named entities on a Doc are converted into displaCy's format.""" - doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) - doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])] - ents = displacy.parse_ents(doc) - assert isinstance(ents, dict) - assert ents["text"] == "But Google is starting from behind " - assert ents["ents"] == [{"start": 4, "end": 10, "label": "ORG"}] - - -def test_displacy_parse_deps(en_vocab): - """Test that deps and tags on a Doc are converted into displaCy's format.""" - words = ["This", "is", "a", "sentence"] - heads = [1, 0, 1, -2] - pos = ["DET", "VERB", "DET", "NOUN"] - tags = ["DT", "VBZ", "DT", "NN"] - deps = ["nsubj", "ROOT", "det", "attr"] - doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps) - deps = displacy.parse_deps(doc) - assert isinstance(deps, dict) - assert deps["words"] == [ - {"text": "This", "tag": "DET"}, - {"text": "is", "tag": "VERB"}, - {"text": "a", "tag": "DET"}, - {"text": "sentence", "tag": "NOUN"}, - ] - assert deps["arcs"] == [ - {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, - {"start": 2, "end": 3, "label": "det", "dir": "left"}, - {"start": 1, "end": 3, "label": "attr", "dir": "right"}, - ] - - -def test_displacy_spans(en_vocab): - """Test that displaCy can render Spans.""" - doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) - doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])] - html = displacy.render(doc[1:4], style="ent") - assert html.startswith("TEST") - - -def test_displacy_raises_for_wrong_type(en_vocab): - with pytest.raises(ValueError): - displacy.render("hello world") - - def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP) assert model.W.shape == (nF, nO, nP, nI) From db79a704bf923602597a933a127fcdca1fc5edfc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 11 Mar 2019 
15:46:52 +0100 Subject: [PATCH 11/27] Add xfail tests for token.conjuncts --- spacy/tests/doc/test_token_api.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 3dd9935b2..9e3f88a48 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -199,3 +199,34 @@ def test_token0_has_sent_start_true(): assert doc[0].is_sent_start is True assert doc[1].is_sent_start is None assert not doc.is_sentenced + + +@pytest.mark.xfail +def test_token_api_conjuncts_chain(en_vocab): + words = "The boy and the girl and the man went .".split() + heads = [1, 7, -1, 1, -3, -1, 1, -3, 0, -1] + deps = ["det", "nsubj", "cc", "det", "conj", "cc", "det", "conj", "ROOT", "punct"] + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + assert [w.text for w in doc[1].conjuncts] == ["girl", "man"] + assert [w.text for w in doc[4].conjuncts] == ["boy", "man"] + assert [w.text for w in doc[7].conjuncts] == ["boy", "girl"] + + +@pytest.mark.xfail +def test_token_api_conjuncts_simple(en_vocab): + words = "They came and went .".split() + heads = [1, 0, -1, -2, -1] + deps = ["nsubj", "ROOT", "cc", "conj"] + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + assert [w.text for w in doc[1].conjuncts] == ["went"] + assert [w.text for w in doc[3].conjuncts] == ["came"] + + +@pytest.mark.xfail +def test_token_api_non_conjuncts(en_vocab): + words = "They came .".split() + heads = [1, 0, -1] + deps = ["nsubj", "ROOT", "punct"] + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + assert [w.text for w in doc[0].conjuncts] == [] + assert [w.text for w in doc[1].conjuncts] == [] From 47e9c274efe55716a9c499744ccfed55632a06e3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 15:59:09 +0100 Subject: [PATCH 12/27] Tidy up property code style (#3391) Use decorator if properties only have a getter and existing syntax if there's getter and setter --- spacy/lexeme.pyx | 26 +- spacy/syntax/arc_eager.pyx | 8 +- spacy/syntax/ner.pyx | 6 +- spacy/tokens/doc.pyx | 90 +++---- spacy/tokens/span.pyx | 352 +++++++++++++-------------- spacy/tokens/token.pyx | 487 ++++++++++++++++++------------------- spacy/vocab.pyx | 12 +- 7 files changed, 490 insertions(+), 491 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 8a1c0b2de..5b88e8fcc 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -161,17 +161,17 @@ cdef class Lexeme: Lexeme.c_from_bytes(self.c, lex_data) self.orth = self.c.orth - property has_vector: + @property + def has_vector(self): """RETURNS (bool): Whether a word vector is associated with the object. """ - def __get__(self): - return self.vocab.has_vector(self.c.orth) + return self.vocab.has_vector(self.c.orth) - property vector_norm: + @property + def vector_norm(self): """RETURNS (float): The L2 norm of the vector representation.""" - def __get__(self): - vector = self.vector - return numpy.sqrt((vector**2).sum()) + vector = self.vector + return numpy.sqrt((vector**2).sum()) property vector: """A real-valued meaning representation. @@ -209,17 +209,17 @@ cdef class Lexeme: def __set__(self, float sentiment): self.c.sentiment = sentiment - property orth_: + @property + def orth_(self): """RETURNS (unicode): The original verbatim text of the lexeme (identical to `Lexeme.text`). 
Exists mostly for consistency with the other attributes.""" - def __get__(self): - return self.vocab.strings[self.c.orth] + return self.vocab.strings[self.c.orth] - property text: + @property + def text(self): """RETURNS (unicode): The original verbatim text of the lexeme.""" - def __get__(self): - return self.orth_ + return self.orth_ property lower: """RETURNS (unicode): Lowercase form of the lexeme.""" diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 2dd269a53..c5b4e4469 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -369,9 +369,9 @@ cdef class ArcEager(TransitionSystem): actions[LEFT].setdefault('dep', 0) return actions - property action_types: - def __get__(self): - return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) + @property + def action_types(self): + return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) def get_cost(self, StateClass state, GoldParse gold, action): cdef Transition t = self.lookup_transition(action) @@ -384,7 +384,7 @@ cdef class ArcEager(TransitionSystem): cdef Transition t = self.lookup_transition(action) t.do(state.c, t.label) return state - + def is_gold_parse(self, StateClass state, GoldParse gold): predicted = set() truth = set() diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 804167b0e..c2a85bfe1 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -80,9 +80,9 @@ cdef class BiluoPushDown(TransitionSystem): actions[action][label] += 1 return actions - property action_types: - def __get__(self): - return (BEGIN, IN, LAST, UNIT, OUT) + @property + def action_types(self): + return (BEGIN, IN, LAST, UNIT, OUT) def move_name(self, int move, attr_t label): if move == OUT: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 857c7b538..dd610bd6d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -384,7 +384,8 @@ cdef class Doc: xp = get_array_module(vector) return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) - property has_vector: + @property + def has_vector(self): """A boolean value indicating whether a word vector is associated with the object. @@ -392,15 +393,14 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#has_vector """ - def __get__(self): - if "has_vector" in self.user_hooks: - return self.user_hooks["has_vector"](self) - elif self.vocab.vectors.data.size: - return True - elif self.tensor.size: - return True - else: - return False + if "has_vector" in self.user_hooks: + return self.user_hooks["has_vector"](self) + elif self.vocab.vectors.data.size: + return True + elif self.tensor.size: + return True + else: + return False property vector: """A real-valued meaning representation. Defaults to an average of the @@ -453,22 +453,22 @@ cdef class Doc: def __set__(self, value): self._vector_norm = value - property text: + @property + def text(self): """A unicode representation of the document text. RETURNS (unicode): The original verbatim text of the document. """ - def __get__(self): - return "".join(t.text_with_ws for t in self) + return "".join(t.text_with_ws for t in self) - property text_with_ws: + @property + def text_with_ws(self): """An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. RETURNS (unicode): The original verbatim text of the document. """ - def __get__(self): - return self.text + return self.text property ents: """The named entities in the document. 
Returns a tuple of named entity @@ -545,7 +545,8 @@ cdef class Doc: # Set start as B self.c[start].ent_iob = 3 - property noun_chunks: + @property + def noun_chunks(self): """Iterate over the base noun phrases in the document. Yields base noun-phrase #[code Span] objects, if the document has been syntactically parsed. A base noun phrase, or "NP chunk", is a noun @@ -557,22 +558,22 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#noun_chunks """ - def __get__(self): - if not self.is_parsed: - raise ValueError(Errors.E029) - # Accumulate the result before beginning to iterate over it. This - # prevents the tokenisation from being changed out from under us - # during the iteration. The tricky thing here is that Span accepts - # its tokenisation changing, so it's okay once we have the Span - # objects. See Issue #375. - spans = [] - if self.noun_chunks_iterator is not None: - for start, end, label in self.noun_chunks_iterator(self): - spans.append(Span(self, start, end, label=label)) - for span in spans: - yield span + if not self.is_parsed: + raise ValueError(Errors.E029) + # Accumulate the result before beginning to iterate over it. This + # prevents the tokenisation from being changed out from under us + # during the iteration. The tricky thing here is that Span accepts + # its tokenisation changing, so it's okay once we have the Span + # objects. See Issue #375. + spans = [] + if self.noun_chunks_iterator is not None: + for start, end, label in self.noun_chunks_iterator(self): + spans.append(Span(self, start, end, label=label)) + for span in spans: + yield span - property sents: + @property + def sents(self): """Iterate over the sentences in the document. Yields sentence `Span` objects. Sentence spans have no label. To improve accuracy on informal texts, spaCy calculates sentence boundaries from the syntactic @@ -583,19 +584,18 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#sents """ - def __get__(self): - if not self.is_sentenced: - raise ValueError(Errors.E030) - if "sents" in self.user_hooks: - yield from self.user_hooks["sents"](self) - else: - start = 0 - for i in range(1, self.length): - if self.c[i].sent_start == 1: - yield Span(self, start, i) - start = i - if start != self.length: - yield Span(self, start, self.length) + if not self.is_sentenced: + raise ValueError(Errors.E030) + if "sents" in self.user_hooks: + yield from self.user_hooks["sents"](self) + else: + start = 0 + for i in range(1, self.length): + if self.c[i].sent_start == 1: + yield Span(self, start, i) + start = i + if start != self.length: + yield Span(self, start, self.length) @property def lang(self): diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 48e791102..d38d6e0fc 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -322,46 +322,47 @@ cdef class Span: self.start = start self.end = end + 1 - property vocab: + @property + def vocab(self): """RETURNS (Vocab): The Span's Doc's vocab.""" - def __get__(self): - return self.doc.vocab + return self.doc.vocab - property sent: + @property + def sent(self): """RETURNS (Span): The sentence span that the span is a part of.""" - def __get__(self): - if "sent" in self.doc.user_span_hooks: - return self.doc.user_span_hooks["sent"](self) - # This should raise if not parsed / no custom sentence boundaries - self.doc.sents - # If doc is parsed we can use the deps to find the sentence - # otherwise we use the `sent_start` token attribute - cdef int n = 0 - cdef int i - if self.doc.is_parsed: - root = &self.doc.c[self.start] - while root.head != 
0: - root += root.head - n += 1 - if n >= self.doc.length: - raise RuntimeError(Errors.E038) - return self.doc[root.l_edge:root.r_edge + 1] - elif self.doc.is_sentenced: - # Find start of the sentence - start = self.start - while self.doc.c[start].sent_start != 1 and start > 0: - start += -1 - # Find end of the sentence - end = self.end - n = 0 - while end < self.doc.length and self.doc.c[end].sent_start != 1: - end += 1 - n += 1 - if n >= self.doc.length: - break - return self.doc[start:end] + if "sent" in self.doc.user_span_hooks: + return self.doc.user_span_hooks["sent"](self) + # This should raise if not parsed / no custom sentence boundaries + self.doc.sents + # If doc is parsed we can use the deps to find the sentence + # otherwise we use the `sent_start` token attribute + cdef int n = 0 + cdef int i + if self.doc.is_parsed: + root = &self.doc.c[self.start] + while root.head != 0: + root += root.head + n += 1 + if n >= self.doc.length: + raise RuntimeError(Errors.E038) + return self.doc[root.l_edge:root.r_edge + 1] + elif self.doc.is_sentenced: + # Find start of the sentence + start = self.start + while self.doc.c[start].sent_start != 1 and start > 0: + start += -1 + # Find end of the sentence + end = self.end + n = 0 + while end < self.doc.length and self.doc.c[end].sent_start != 1: + end += 1 + n += 1 + if n >= self.doc.length: + break + return self.doc[start:end] - property ents: + @property + def ents(self): """The named entities in the span. Returns a tuple of named entity `Span` objects, if the entity recognizer has been applied. @@ -369,14 +370,14 @@ cdef class Span: DOCS: https://spacy.io/api/span#ents """ - def __get__(self): - ents = [] - for ent in self.doc.ents: - if ent.start >= self.start and ent.end <= self.end: - ents.append(ent) - return ents + ents = [] + for ent in self.doc.ents: + if ent.start >= self.start and ent.end <= self.end: + ents.append(ent) + return ents - property has_vector: + @property + def has_vector(self): """A boolean value indicating whether a word vector is associated with the object. @@ -384,17 +385,17 @@ cdef class Span: DOCS: https://spacy.io/api/span#has_vector """ - def __get__(self): - if "has_vector" in self.doc.user_span_hooks: - return self.doc.user_span_hooks["has_vector"](self) - elif self.vocab.vectors.data.size > 0: - return any(token.has_vector for token in self) - elif self.doc.tensor.size > 0: - return True - else: - return False + if "has_vector" in self.doc.user_span_hooks: + return self.doc.user_span_hooks["has_vector"](self) + elif self.vocab.vectors.data.size > 0: + return any(token.has_vector for token in self) + elif self.doc.tensor.size > 0: + return True + else: + return False - property vector: + @property + def vector(self): """A real-valued meaning representation. Defaults to an average of the token vectors. @@ -403,61 +404,61 @@ cdef class Span: DOCS: https://spacy.io/api/span#vector """ - def __get__(self): - if "vector" in self.doc.user_span_hooks: - return self.doc.user_span_hooks["vector"](self) - if self._vector is None: - self._vector = sum(t.vector for t in self) / len(self) - return self._vector + if "vector" in self.doc.user_span_hooks: + return self.doc.user_span_hooks["vector"](self) + if self._vector is None: + self._vector = sum(t.vector for t in self) / len(self) + return self._vector - property vector_norm: + @property + def vector_norm(self): """The L2 norm of the span's vector representation. RETURNS (float): The L2 norm of the vector representation. 
DOCS: https://spacy.io/api/span#vector_norm """ - def __get__(self): - if "vector_norm" in self.doc.user_span_hooks: - return self.doc.user_span_hooks["vector"](self) - cdef float value - cdef double norm = 0 - if self._vector_norm is None: - norm = 0 - for value in self.vector: - norm += value * value - self._vector_norm = sqrt(norm) if norm != 0 else 0 - return self._vector_norm + if "vector_norm" in self.doc.user_span_hooks: + return self.doc.user_span_hooks["vector"](self) + cdef float value + cdef double norm = 0 + if self._vector_norm is None: + norm = 0 + for value in self.vector: + norm += value * value + self._vector_norm = sqrt(norm) if norm != 0 else 0 + return self._vector_norm - property sentiment: + @property + def sentiment(self): """RETURNS (float): A scalar value indicating the positivity or negativity of the span. """ - def __get__(self): - if "sentiment" in self.doc.user_span_hooks: - return self.doc.user_span_hooks["sentiment"](self) - else: - return sum([token.sentiment for token in self]) / len(self) + if "sentiment" in self.doc.user_span_hooks: + return self.doc.user_span_hooks["sentiment"](self) + else: + return sum([token.sentiment for token in self]) / len(self) - property text: + @property + def text(self): """RETURNS (unicode): The original verbatim text of the span.""" - def __get__(self): - text = self.text_with_ws - if self[-1].whitespace_: - text = text[:-1] - return text + text = self.text_with_ws + if self[-1].whitespace_: + text = text[:-1] + return text - property text_with_ws: + @property + def text_with_ws(self): """The text content of the span with a trailing whitespace character if the last token has one. RETURNS (unicode): The text content of the span (with trailing whitespace). """ - def __get__(self): - return "".join([t.text_with_ws for t in self]) + return "".join([t.text_with_ws for t in self]) - property noun_chunks: + @property + def noun_chunks(self): """Yields base noun-phrase `Span` objects, if the document has been syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it – so no @@ -468,23 +469,23 @@ cdef class Span: DOCS: https://spacy.io/api/span#noun_chunks """ - def __get__(self): - if not self.doc.is_parsed: - raise ValueError(Errors.E029) - # Accumulate the result before beginning to iterate over it. This - # prevents the tokenisation from being changed out from under us - # during the iteration. The tricky thing here is that Span accepts - # its tokenisation changing, so it's okay once we have the Span - # objects. See Issue #375 - spans = [] - cdef attr_t label - if self.doc.noun_chunks_iterator is not None: - for start, end, label in self.doc.noun_chunks_iterator(self): - spans.append(Span(self.doc, start, end, label=label)) - for span in spans: - yield span + if not self.doc.is_parsed: + raise ValueError(Errors.E029) + # Accumulate the result before beginning to iterate over it. This + # prevents the tokenisation from being changed out from under us + # during the iteration. The tricky thing here is that Span accepts + # its tokenisation changing, so it's okay once we have the Span + # objects. 
See Issue #375 + spans = [] + cdef attr_t label + if self.doc.noun_chunks_iterator is not None: + for start, end, label in self.doc.noun_chunks_iterator(self): + spans.append(Span(self.doc, start, end, label=label)) + for span in spans: + yield span - property root: + @property + def root(self): """The token with the shortest path to the root of the sentence (or the root itself). If multiple tokens are equally high in the tree, the first token is taken. @@ -493,41 +494,41 @@ cdef class Span: DOCS: https://spacy.io/api/span#root """ - def __get__(self): - self._recalculate_indices() - if "root" in self.doc.user_span_hooks: - return self.doc.user_span_hooks["root"](self) - # This should probably be called 'head', and the other one called - # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/ - cdef int i - # First, we scan through the Span, and check whether there's a word - # with head==0, i.e. a sentence root. If so, we can return it. The - # longer the span, the more likely it contains a sentence root, and - # in this case we return in linear time. - for i in range(self.start, self.end): - if self.doc.c[i].head == 0: - return self.doc[i] - # If we don't have a sentence root, we do something that's not so - # algorithmically clever, but I think should be quite fast, - # especially for short spans. - # For each word, we count the path length, and arg min this measure. - # We could use better tree logic to save steps here...But I - # think this should be okay. - cdef int current_best = self.doc.length - cdef int root = -1 - for i in range(self.start, self.end): - if self.start <= (i+self.doc.c[i].head) < self.end: - continue - words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length) - if words_to_root < current_best: - current_best = words_to_root - root = i - if root == -1: - return self.doc[self.start] - else: - return self.doc[root] + self._recalculate_indices() + if "root" in self.doc.user_span_hooks: + return self.doc.user_span_hooks["root"](self) + # This should probably be called 'head', and the other one called + # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/ + cdef int i + # First, we scan through the Span, and check whether there's a word + # with head==0, i.e. a sentence root. If so, we can return it. The + # longer the span, the more likely it contains a sentence root, and + # in this case we return in linear time. + for i in range(self.start, self.end): + if self.doc.c[i].head == 0: + return self.doc[i] + # If we don't have a sentence root, we do something that's not so + # algorithmically clever, but I think should be quite fast, + # especially for short spans. + # For each word, we count the path length, and arg min this measure. + # We could use better tree logic to save steps here...But I + # think this should be okay. + cdef int current_best = self.doc.length + cdef int root = -1 + for i in range(self.start, self.end): + if self.start <= (i+self.doc.c[i].head) < self.end: + continue + words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length) + if words_to_root < current_best: + current_best = words_to_root + root = i + if root == -1: + return self.doc[self.start] + else: + return self.doc[root] - property lefts: + @property + def lefts(self): """Tokens that are to the left of the span, whose head is within the `Span`. 
@@ -535,13 +536,13 @@ cdef class Span: DOCS: https://spacy.io/api/span#lefts """ - def __get__(self): - for token in reversed(self): # Reverse, so we get tokens in order - for left in token.lefts: - if left.i < self.start: - yield left + for token in reversed(self): # Reverse, so we get tokens in order + for left in token.lefts: + if left.i < self.start: + yield left - property rights: + @property + def rights(self): """Tokens that are to the right of the Span, whose head is within the `Span`. @@ -549,13 +550,13 @@ cdef class Span: DOCS: https://spacy.io/api/span#rights """ - def __get__(self): - for token in self: - for right in token.rights: - if right.i >= self.end: - yield right + for token in self: + for right in token.rights: + if right.i >= self.end: + yield right - property n_lefts: + @property + def n_lefts(self): """The number of tokens that are to the left of the span, whose heads are within the span. @@ -564,10 +565,10 @@ cdef class Span: DOCS: https://spacy.io/api/span#n_lefts """ - def __get__(self): - return len(list(self.lefts)) + return len(list(self.lefts)) - property n_rights: + @property + def n_rights(self): """The number of tokens that are to the right of the span, whose heads are within the span. @@ -576,22 +577,21 @@ cdef class Span: DOCS: https://spacy.io/api/span#n_rights """ - def __get__(self): - return len(list(self.rights)) + return len(list(self.rights)) - property subtree: + @property + def subtree(self): """Tokens within the span and tokens which descend from them. YIELDS (Token): A token within the span, or a descendant from it. DOCS: https://spacy.io/api/span#subtree """ - def __get__(self): - for word in self.lefts: - yield from word.subtree - yield from self - for word in self.rights: - yield from word.subtree + for word in self.lefts: + yield from word.subtree + yield from self + for word in self.rights: + yield from word.subtree property ent_id: """RETURNS (uint64): The entity ID.""" @@ -609,33 +609,33 @@ cdef class Span: def __set__(self, hash_t key): raise NotImplementedError(TempErrors.T007.format(attr="ent_id_")) - property orth_: + @property + def orth_(self): """Verbatim text content (identical to `Span.text`). Exists mostly for consistency with other attributes. RETURNS (unicode): The span's text.""" - def __get__(self): - return self.text + return self.text - property lemma_: + @property + def lemma_(self): """RETURNS (unicode): The span's lemma.""" - def __get__(self): - return " ".join([t.lemma_ for t in self]).strip() + return " ".join([t.lemma_ for t in self]).strip() - property upper_: + @property + def upper_(self): """Deprecated. Use `Span.text.upper()` instead.""" - def __get__(self): - return "".join([t.text_with_ws.upper() for t in self]).strip() + return "".join([t.text_with_ws.upper() for t in self]).strip() - property lower_: + @property + def lower_(self): """Deprecated. 
Use `Span.text.lower()` instead.""" - def __get__(self): - return "".join([t.text_with_ws.lower() for t in self]).strip() + return "".join([t.text_with_ws.lower() for t in self]).strip() - property string: + @property + def string(self): """Deprecated: Use `Span.text_with_ws` instead.""" - def __get__(self): - return "".join([t.text_with_ws for t in self]) + return "".join([t.text_with_ws for t in self]) property label_: """RETURNS (unicode): The span's label.""" diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 771e43549..960679e9c 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -218,111 +218,111 @@ cdef class Token: xp = get_array_module(vector) return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) - property lex_id: + @property + def lex_id(self): """RETURNS (int): Sequential ID of the token's lexical type.""" - def __get__(self): - return self.c.lex.id + return self.c.lex.id - property rank: + @property + def rank(self): """RETURNS (int): Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors.""" - def __get__(self): - return self.c.lex.id + return self.c.lex.id - property string: + @property + def string(self): """Deprecated: Use Token.text_with_ws instead.""" - def __get__(self): - return self.text_with_ws + return self.text_with_ws - property text: + @property + def text(self): """RETURNS (unicode): The original verbatim text of the token.""" - def __get__(self): - return self.orth_ + return self.orth_ - property text_with_ws: + @property + def text_with_ws(self): """RETURNS (unicode): The text content of the span (with trailing whitespace). """ - def __get__(self): - cdef unicode orth = self.vocab.strings[self.c.lex.orth] - if self.c.spacy: - return orth + " " - else: - return orth + cdef unicode orth = self.vocab.strings[self.c.lex.orth] + if self.c.spacy: + return orth + " " + else: + return orth - property prob: + @property + def prob(self): """RETURNS (float): Smoothed log probability estimate of token type.""" - def __get__(self): - return self.c.lex.prob + return self.c.lex.prob - property sentiment: + @property + def sentiment(self): """RETURNS (float): A scalar value indicating the positivity or negativity of the token.""" - def __get__(self): - if "sentiment" in self.doc.user_token_hooks: - return self.doc.user_token_hooks["sentiment"](self) - return self.c.lex.sentiment + if "sentiment" in self.doc.user_token_hooks: + return self.doc.user_token_hooks["sentiment"](self) + return self.c.lex.sentiment - property lang: + @property + def lang(self): """RETURNS (uint64): ID of the language of the parent document's vocabulary. """ - def __get__(self): - return self.c.lex.lang + return self.c.lex.lang - property idx: + @property + def idx(self): """RETURNS (int): The character offset of the token within the parent document. 
""" - def __get__(self): - return self.c.idx + return self.c.idx - property cluster: + @property + def cluster(self): """RETURNS (int): Brown cluster ID.""" - def __get__(self): - return self.c.lex.cluster + return self.c.lex.cluster - property orth: + @property + def orth(self): """RETURNS (uint64): ID of the verbatim text content.""" - def __get__(self): - return self.c.lex.orth + return self.c.lex.orth - property lower: + @property + def lower(self): """RETURNS (uint64): ID of the lowercase token text.""" - def __get__(self): - return self.c.lex.lower + return self.c.lex.lower - property norm: + @property + def norm(self): """RETURNS (uint64): ID of the token's norm, i.e. a normalised form of the token text. Usually set in the language's tokenizer exceptions or norm exceptions. """ - def __get__(self): - if self.c.norm == 0: - return self.c.lex.norm - else: - return self.c.norm + if self.c.norm == 0: + return self.c.lex.norm + else: + return self.c.norm - property shape: + @property + def shape(self): """RETURNS (uint64): ID of the token's shape, a transform of the tokens's string, to show orthographic features (e.g. "Xxxx", "dd"). """ - def __get__(self): - return self.c.lex.shape + return self.c.lex.shape - property prefix: + @property + def prefix(self): """RETURNS (uint64): ID of a length-N substring from the start of the token. Defaults to `N=1`. """ - def __get__(self): - return self.c.lex.prefix + return self.c.lex.prefix - property suffix: + @property + def suffix(self): """RETURNS (uint64): ID of a length-N substring from the end of the token. Defaults to `N=3`. """ - def __get__(self): - return self.c.lex.suffix + return self.c.lex.suffix property lemma: """RETURNS (uint64): ID of the base form of the word, with no @@ -362,7 +362,8 @@ cdef class Token: def __set__(self, attr_t label): self.c.dep = label - property has_vector: + @property + def has_vector(self): """A boolean value indicating whether a word vector is associated with the object. @@ -370,14 +371,14 @@ cdef class Token: DOCS: https://spacy.io/api/token#has_vector """ - def __get__(self): - if 'has_vector' in self.doc.user_token_hooks: - return self.doc.user_token_hooks["has_vector"](self) - if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: - return True - return self.vocab.has_vector(self.c.lex.orth) + if "has_vector" in self.doc.user_token_hooks: + return self.doc.user_token_hooks["has_vector"](self) + if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: + return True + return self.vocab.has_vector(self.c.lex.orth) - property vector: + @property + def vector(self): """A real-valued meaning representation. RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array @@ -385,28 +386,28 @@ cdef class Token: DOCS: https://spacy.io/api/token#vector """ - def __get__(self): - if 'vector' in self.doc.user_token_hooks: - return self.doc.user_token_hooks["vector"](self) - if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: - return self.doc.tensor[self.i] - else: - return self.vocab.get_vector(self.c.lex.orth) + if "vector" in self.doc.user_token_hooks: + return self.doc.user_token_hooks["vector"](self) + if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: + return self.doc.tensor[self.i] + else: + return self.vocab.get_vector(self.c.lex.orth) - property vector_norm: + @property + def vector_norm(self): """The L2 norm of the token's vector representation. RETURNS (float): The L2 norm of the vector representation. 
DOCS: https://spacy.io/api/token#vector_norm """ - def __get__(self): - if 'vector_norm' in self.doc.user_token_hooks: - return self.doc.user_token_hooks["vector_norm"](self) - vector = self.vector - return numpy.sqrt((vector ** 2).sum()) + if "vector_norm" in self.doc.user_token_hooks: + return self.doc.user_token_hooks["vector_norm"](self) + vector = self.vector + return numpy.sqrt((vector ** 2).sum()) - property n_lefts: + @property + def n_lefts(self): """The number of leftward immediate children of the word, in the syntactic dependency parse. @@ -415,10 +416,10 @@ cdef class Token: DOCS: https://spacy.io/api/token#n_lefts """ - def __get__(self): - return self.c.l_kids + return self.c.l_kids - property n_rights: + @property + def n_rights(self): """The number of rightward immediate children of the word, in the syntactic dependency parse. @@ -427,15 +428,14 @@ cdef class Token: DOCS: https://spacy.io/api/token#n_rights """ - def __get__(self): - return self.c.r_kids + return self.c.r_kids - property sent: + @property + def sent(self): """RETURNS (Span): The sentence span that the token is a part of.""" - def __get__(self): - if 'sent' in self.doc.user_token_hooks: - return self.doc.user_token_hooks["sent"](self) - return self.doc[self.i : self.i+1].sent + if 'sent' in self.doc.user_token_hooks: + return self.doc.user_token_hooks["sent"](self) + return self.doc[self.i : self.i+1].sent property sent_start: def __get__(self): @@ -479,7 +479,8 @@ cdef class Token: else: raise ValueError(Errors.E044.format(value=value)) - property lefts: + @property + def lefts(self): """The leftward immediate children of the word, in the syntactic dependency parse. @@ -487,19 +488,19 @@ cdef class Token: DOCS: https://spacy.io/api/token#lefts """ - def __get__(self): - cdef int nr_iter = 0 - cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge) - while ptr < self.c: - if ptr + ptr.head == self.c: - yield self.doc[ptr - (self.c - self.i)] - ptr += 1 - nr_iter += 1 - # This is ugly, but it's a way to guard out infinite loops - if nr_iter >= 10000000: - raise RuntimeError(Errors.E045.format(attr="token.lefts")) + cdef int nr_iter = 0 + cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge) + while ptr < self.c: + if ptr + ptr.head == self.c: + yield self.doc[ptr - (self.c - self.i)] + ptr += 1 + nr_iter += 1 + # This is ugly, but it's a way to guard out infinite loops + if nr_iter >= 10000000: + raise RuntimeError(Errors.E045.format(attr="token.lefts")) - property rights: + @property + def rights(self): """The rightward immediate children of the word, in the syntactic dependency parse. 
@@ -507,33 +508,33 @@ cdef class Token: DOCS: https://spacy.io/api/token#rights """ - def __get__(self): - cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) - tokens = [] - cdef int nr_iter = 0 - while ptr > self.c: - if ptr + ptr.head == self.c: - tokens.append(self.doc[ptr - (self.c - self.i)]) - ptr -= 1 - nr_iter += 1 - if nr_iter >= 10000000: - raise RuntimeError(Errors.E045.format(attr="token.rights")) - tokens.reverse() - for t in tokens: - yield t + cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) + tokens = [] + cdef int nr_iter = 0 + while ptr > self.c: + if ptr + ptr.head == self.c: + tokens.append(self.doc[ptr - (self.c - self.i)]) + ptr -= 1 + nr_iter += 1 + if nr_iter >= 10000000: + raise RuntimeError(Errors.E045.format(attr="token.rights")) + tokens.reverse() + for t in tokens: + yield t - property children: + @property + def children(self): """A sequence of the token's immediate syntactic children. YIELDS (Token): A child token such that `child.head==self`. DOCS: https://spacy.io/api/token#children """ - def __get__(self): - yield from self.lefts - yield from self.rights + yield from self.lefts + yield from self.rights - property subtree: + @property + def subtree(self): """A sequence containing the token and all the token's syntactic descendants. @@ -542,30 +543,30 @@ cdef class Token: DOCS: https://spacy.io/api/token#subtree """ - def __get__(self): - for word in self.lefts: - yield from word.subtree - yield self - for word in self.rights: - yield from word.subtree + for word in self.lefts: + yield from word.subtree + yield self + for word in self.rights: + yield from word.subtree - property left_edge: + @property + def left_edge(self): """The leftmost token of this token's syntactic descendents. RETURNS (Token): The first token such that `self.is_ancestor(token)`. """ - def __get__(self): - return self.doc[self.c.l_edge] + return self.doc[self.c.l_edge] - property right_edge: + @property + def right_edge(self): """The rightmost token of this token's syntactic descendents. RETURNS (Token): The last token such that `self.is_ancestor(token)`. """ - def __get__(self): - return self.doc[self.c.r_edge] + return self.doc[self.c.r_edge] - property ancestors: + @property + def ancestors(self): """A sequence of this token's syntactic ancestors. YIELDS (Token): A sequence of ancestor tokens such that @@ -573,15 +574,14 @@ cdef class Token: DOCS: https://spacy.io/api/token#ancestors """ - def __get__(self): - cdef const TokenC* head_ptr = self.c - # Guard against infinite loop, no token can have - # more ancestors than tokens in the tree. - cdef int i = 0 - while head_ptr.head != 0 and i < self.doc.length: - head_ptr += head_ptr.head - yield self.doc[head_ptr - (self.c - self.i)] - i += 1 + cdef const TokenC* head_ptr = self.c + # Guard against infinite loop, no token can have + # more ancestors than tokens in the tree. + cdef int i = 0 + while head_ptr.head != 0 and i < self.doc.length: + head_ptr += head_ptr.head + yield self.doc[head_ptr - (self.c - self.i)] + i += 1 def is_ancestor(self, descendant): """Check whether this token is a parent, grandparent, etc. of another @@ -685,23 +685,23 @@ cdef class Token: # Set new head self.c.head = rel_newhead_i - property conjuncts: + @property + def conjuncts(self): """A sequence of coordinated tokens, including the token itself. YIELDS (Token): A coordinated token. 
DOCS: https://spacy.io/api/token#conjuncts """ - def __get__(self): - cdef Token word - if "conjuncts" in self.doc.user_token_hooks: - yield from self.doc.user_token_hooks["conjuncts"](self) - else: - if self.dep != conj: - for word in self.rights: - if word.dep == conj: - yield word - yield from word.conjuncts + cdef Token word + if "conjuncts" in self.doc.user_token_hooks: + yield from self.doc.user_token_hooks["conjuncts"](self) + else: + if self.dep != conj: + for word in self.rights: + if word.dep == conj: + yield word + yield from word.conjuncts property ent_type: """RETURNS (uint64): Named entity type.""" @@ -711,15 +711,6 @@ cdef class Token: def __set__(self, ent_type): self.c.ent_type = ent_type - property ent_iob: - """IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag - is assigned. - - RETURNS (uint64): IOB code of named entity tag. - """ - def __get__(self): - return self.c.ent_iob - property ent_type_: """RETURNS (unicode): Named entity type.""" def __get__(self): @@ -728,16 +719,25 @@ cdef class Token: def __set__(self, ent_type): self.c.ent_type = self.vocab.strings.add(ent_type) - property ent_iob_: + @property + def ent_iob(self): + """IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag + is assigned. + + RETURNS (uint64): IOB code of named entity tag. + """ + return self.c.ent_iob + + @property + def ent_iob_(self): """IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. RETURNS (unicode): IOB code of named entity tag. """ - def __get__(self): - iob_strings = ("", "I", "O", "B") - return iob_strings[self.c.ent_iob] + iob_strings = ("", "I", "O", "B") + return iob_strings[self.c.ent_iob] property ent_id: """RETURNS (uint64): ID of the entity the token is an instance of, @@ -759,26 +759,25 @@ cdef class Token: def __set__(self, name): self.c.ent_id = self.vocab.strings.add(name) - property whitespace_: - """RETURNS (unicode): The trailing whitespace character, if present. - """ - def __get__(self): - return " " if self.c.spacy else "" + @property + def whitespace_(self): + """RETURNS (unicode): The trailing whitespace character, if present.""" + return " " if self.c.spacy else "" - property orth_: + @property + def orth_(self): """RETURNS (unicode): Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. """ - def __get__(self): - return self.vocab.strings[self.c.lex.orth] + return self.vocab.strings[self.c.lex.orth] - property lower_: + @property + def lower_(self): """RETURNS (unicode): The lowercase token text. Equivalent to `Token.text.lower()`. """ - def __get__(self): - return self.vocab.strings[self.c.lex.lower] + return self.vocab.strings[self.c.lex.lower] property norm_: """RETURNS (unicode): The token's norm, i.e. a normalised form of the @@ -791,33 +790,33 @@ cdef class Token: def __set__(self, unicode norm_): self.c.norm = self.vocab.strings.add(norm_) - property shape_: + @property + def shape_(self): """RETURNS (unicode): Transform of the tokens's string, to show orthographic features. For example, "Xxxx" or "dd". """ - def __get__(self): - return self.vocab.strings[self.c.lex.shape] + return self.vocab.strings[self.c.lex.shape] - property prefix_: + @property + def prefix_(self): """RETURNS (unicode): A length-N substring from the start of the token. Defaults to `N=1`. 
""" - def __get__(self): - return self.vocab.strings[self.c.lex.prefix] + return self.vocab.strings[self.c.lex.prefix] - property suffix_: + @property + def suffix_(self): """RETURNS (unicode): A length-N substring from the end of the token. Defaults to `N=3`. """ - def __get__(self): - return self.vocab.strings[self.c.lex.suffix] + return self.vocab.strings[self.c.lex.suffix] - property lang_: + @property + def lang_(self): """RETURNS (unicode): Language of the parent document's vocabulary, e.g. 'en'. """ - def __get__(self): - return self.vocab.strings[self.c.lex.lang] + return self.vocab.strings[self.c.lex.lang] property lemma_: """RETURNS (unicode): The token lemma, i.e. the base form of the word, @@ -856,110 +855,110 @@ cdef class Token: def __set__(self, unicode label): self.c.dep = self.vocab.strings.add(label) - property is_oov: + @property + def is_oov(self): """RETURNS (bool): Whether the token is out-of-vocabulary.""" - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_OOV) + return Lexeme.c_check_flag(self.c.lex, IS_OOV) - property is_stop: + @property + def is_stop(self): """RETURNS (bool): Whether the token is a stop word, i.e. part of a "stop list" defined by the language data. """ - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_STOP) + return Lexeme.c_check_flag(self.c.lex, IS_STOP) - property is_alpha: + @property + def is_alpha(self): """RETURNS (bool): Whether the token consists of alpha characters. Equivalent to `token.text.isalpha()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_ALPHA) + return Lexeme.c_check_flag(self.c.lex, IS_ALPHA) - property is_ascii: + @property + def is_ascii(self): """RETURNS (bool): Whether the token consists of ASCII characters. Equivalent to `[any(ord(c) >= 128 for c in token.text)]`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_ASCII) + return Lexeme.c_check_flag(self.c.lex, IS_ASCII) - property is_digit: + @property + def is_digit(self): """RETURNS (bool): Whether the token consists of digits. Equivalent to `token.text.isdigit()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_DIGIT) + return Lexeme.c_check_flag(self.c.lex, IS_DIGIT) - property is_lower: + @property + def is_lower(self): """RETURNS (bool): Whether the token is in lowercase. Equivalent to `token.text.islower()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_LOWER) + return Lexeme.c_check_flag(self.c.lex, IS_LOWER) - property is_upper: + @property + def is_upper(self): """RETURNS (bool): Whether the token is in uppercase. Equivalent to `token.text.isupper()` """ - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_UPPER) + return Lexeme.c_check_flag(self.c.lex, IS_UPPER) - property is_title: + @property + def is_title(self): """RETURNS (bool): Whether the token is in titlecase. Equivalent to `token.text.istitle()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_TITLE) + return Lexeme.c_check_flag(self.c.lex, IS_TITLE) - property is_punct: + @property + def is_punct(self): """RETURNS (bool): Whether the token is punctuation.""" - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) + return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) - property is_space: + @property + def is_space(self): """RETURNS (bool): Whether the token consists of whitespace characters. Equivalent to `token.text.isspace()`. 
""" - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_SPACE) + return Lexeme.c_check_flag(self.c.lex, IS_SPACE) - property is_bracket: + @property + def is_bracket(self): """RETURNS (bool): Whether the token is a bracket.""" - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) + return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) - property is_quote: + @property + def is_quote(self): """RETURNS (bool): Whether the token is a quotation mark.""" - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) + return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) - property is_left_punct: + @property + def is_left_punct(self): """RETURNS (bool): Whether the token is a left punctuation mark.""" - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) + return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) - property is_right_punct: + @property + def is_right_punct(self): """RETURNS (bool): Whether the token is a right punctuation mark.""" - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) + return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) - property is_currency: + @property + def is_currency(self): """RETURNS (bool): Whether the token is a currency symbol.""" - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, IS_CURRENCY) + return Lexeme.c_check_flag(self.c.lex, IS_CURRENCY) - property like_url: + @property + def like_url(self): """RETURNS (bool): Whether the token resembles a URL.""" - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, LIKE_URL) + return Lexeme.c_check_flag(self.c.lex, LIKE_URL) - property like_num: + @property + def like_num(self): """RETURNS (bool): Whether the token resembles a number, e.g. "10.9", "10", "ten", etc. """ - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, LIKE_NUM) + return Lexeme.c_check_flag(self.c.lex, LIKE_NUM) - property like_email: + @property + def like_email(self): """RETURNS (bool): Whether the token resembles an email address.""" - def __get__(self): - return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) + return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 90e7dca34..e64394ee8 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -60,12 +60,12 @@ cdef class Vocab: self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.vectors = Vectors() - property lang: - def __get__(self): - langfunc = None - if self.lex_attr_getters: - langfunc = self.lex_attr_getters.get(LANG, None) - return langfunc("_") if langfunc else "" + @property + def lang(self): + langfunc = None + if self.lex_attr_getters: + langfunc = self.lex_attr_getters.get(LANG, None) + return langfunc("_") if langfunc else "" property writing_system: """A dict with information about the language's writing system. 
To get From b0b990e4050a5d7dc4d21ad5092d199d000f256c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 11 Mar 2019 17:05:45 +0100 Subject: [PATCH 13/27] Fix token.conjuncts (closes #795) (#3392) * Implement conjuncts method * Add span.conjuncts property * Un-xfail token.conjuncts tests * Update docs for token.conjuncts and span.conjuncts * Fix merge error in token.conjuncts --- spacy/tests/doc/test_token_api.py | 3 --- spacy/tokens/span.pyx | 10 ++++++++++ spacy/tokens/token.pyx | 26 +++++++++++++++++--------- website/docs/api/span.md | 18 ++++++++++++++++++ website/docs/api/token.md | 4 ++-- 5 files changed, 47 insertions(+), 14 deletions(-) diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 9e3f88a48..bff2a95c6 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -201,7 +201,6 @@ def test_token0_has_sent_start_true(): assert not doc.is_sentenced -@pytest.mark.xfail def test_token_api_conjuncts_chain(en_vocab): words = "The boy and the girl and the man went .".split() heads = [1, 7, -1, 1, -3, -1, 1, -3, 0, -1] @@ -212,7 +211,6 @@ def test_token_api_conjuncts_chain(en_vocab): assert [w.text for w in doc[7].conjuncts] == ["boy", "girl"] -@pytest.mark.xfail def test_token_api_conjuncts_simple(en_vocab): words = "They came and went .".split() heads = [1, 0, -1, -2, -1] @@ -222,7 +220,6 @@ def test_token_api_conjuncts_simple(en_vocab): assert [w.text for w in doc[3].conjuncts] == ["came"] -@pytest.mark.xfail def test_token_api_non_conjuncts(en_vocab): words = "They came .".split() heads = [1, 0, -1] diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index d38d6e0fc..b51ca3e57 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -527,6 +527,16 @@ cdef class Span: else: return self.doc[root] + @property + def conjuncts(self): + """Tokens that are conjoined to the span's root. + + RETURNS (tuple): A tuple of Token objects. + + DOCS: https://spacy.io/api/span#lefts + """ + return self.root.conjuncts + @property def lefts(self): """Tokens that are to the left of the span, whose head is within the diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 960679e9c..409b68290 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -689,19 +689,27 @@ cdef class Token: def conjuncts(self): """A sequence of coordinated tokens, including the token itself. - YIELDS (Token): A coordinated token. + RETURNS (tuple): The coordinated tokens. DOCS: https://spacy.io/api/token#conjuncts """ - cdef Token word + cdef Token word, child if "conjuncts" in self.doc.user_token_hooks: - yield from self.doc.user_token_hooks["conjuncts"](self) - else: - if self.dep != conj: - for word in self.rights: - if word.dep == conj: - yield word - yield from word.conjuncts + return tuple(self.doc.user_token_hooks["conjuncts"](self)) + start = self + while start.i != start.head.i: + if start.dep == conj: + start = start.head + else: + break + queue = [start] + output = [start] + for word in queue: + for child in word.rights: + if child.c.dep == conj: + output.append(child) + queue.append(child) + return tuple([w for w in output if w.i != self.i]) property ent_type: """RETURNS (uint64): Named entity type.""" diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 033aa579c..5445f13df 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -316,6 +316,24 @@ taken. | ----------- | ------- | --------------- | | **RETURNS** | `Token` | The root token. 
| +## Span.conjuncts {#conjuncts tag="property" model="parser"} + +A tuple of tokens coordinated to `span.root`. + +> #### Example +> +> ```python +> doc = nlp(u"I like apples and oranges") +> apples_conjuncts = doc[2:3].conjuncts +> assert [t.text for t in apples_conjuncts] == [u"oranges"] +> ``` + +| Name | Type | Description | +| ---------- | ------- | -------------------- | +| **RETURNS** | `tuple` | The coordinated tokens. | + + + ## Span.lefts {#lefts tag="property" model="parser"} Tokens that are to the left of the span, whose heads are within the span. diff --git a/website/docs/api/token.md b/website/docs/api/token.md index f30fd4639..6981f0fc7 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -211,7 +211,7 @@ The rightmost token of this token's syntactic descendants. ## Token.conjuncts {#conjuncts tag="property" model="parser"} -A sequence of coordinated tokens, including the token itself. +A tuple of coordinated tokens, not including the token itself. > #### Example > @@ -223,7 +223,7 @@ A sequence of coordinated tokens, including the token itself. | Name | Type | Description | | ---------- | ------- | -------------------- | -| **YIELDS** | `Token` | A coordinated token. | +| **RETURNS** | `tuple` | The coordinated tokens. | ## Token.children {#children tag="property" model="parser"} From cdd418b93edf0d485819d44e4481ca3a0b5adbe7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 17:10:50 +0100 Subject: [PATCH 14/27] Auto-format [ci skip] --- spacy/lang/he/__init__.py | 1 + spacy/lang/zh/__init__.py | 1 + spacy/util.py | 2 +- website/docs/api/span.md | 6 ++---- website/docs/api/token.md | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index 0ad65a0b4..411cdf107 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -16,6 +16,7 @@ class HebrewDefaults(Language.Defaults): stop_words = STOP_WORDS writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} + class Hebrew(Language): lang = "he" Defaults = HebrewDefaults diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 708e446ba..773bbcf38 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -16,6 +16,7 @@ class ChineseDefaults(Language.Defaults): stop_words = STOP_WORDS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + class Chinese(Language): lang = "zh" Defaults = ChineseDefaults # override defaults diff --git a/spacy/util.py b/spacy/util.py index 137d466d5..7a36fe958 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -48,7 +48,7 @@ def lang_class_is_loaded(lang): """ global LANGUAGES return lang in LANGUAGES - + def get_lang_class(lang): """Import and load a Language class. diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 5445f13df..43924a2b5 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -328,12 +328,10 @@ A tuple of tokens coordinated to `span.root`. > assert [t.text for t in apples_conjuncts] == [u"oranges"] > ``` -| Name | Type | Description | -| ---------- | ------- | -------------------- | +| Name | Type | Description | +| ----------- | ------- | ----------------------- | | **RETURNS** | `tuple` | The coordinated tokens. | - - ## Span.lefts {#lefts tag="property" model="parser"} Tokens that are to the left of the span, whose heads are within the span. 
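
[Editor's note] A minimal usage sketch of the `conjuncts` behaviour introduced in the patch above: `Token.conjuncts` and `Span.conjuncts` now return a tuple of the other coordinated tokens rather than yielding them, and the token itself is excluded. The model name `en_core_web_sm` and the expected output are assumptions (they depend on having an English model with a parser installed and on the parse it produces); the sentence mirrors the example used in the patched docs.

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: any English model with a parser
doc = nlp(u"I like apples and oranges")

# Token.conjuncts returns a tuple of coordinated tokens, not including the token itself
apples = doc[2]
print([t.text for t in apples.conjuncts])    # expected: ['oranges'] (given a correct parse)

# Span.conjuncts delegates to the span's root token
print([t.text for t in doc[2:3].conjuncts])  # expected: ['oranges']
```
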
diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 6981f0fc7..aa55d97c1 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -221,8 +221,8 @@ A tuple of coordinated tokens, not including the token itself. > assert [t.text for t in apples_conjuncts] == [u"oranges"] > ``` -| Name | Type | Description | -| ---------- | ------- | -------------------- | +| Name | Type | Description | +| ----------- | ------- | ----------------------- | | **RETURNS** | `tuple` | The coordinated tokens. | ## Token.children {#children tag="property" model="parser"} From 4bd2688eac26842576bab8918bbf610963e642d3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 18:52:50 +0100 Subject: [PATCH 15/27] =?UTF-8?q?=F0=9F=92=AB=20Fix=20displaCy=20support?= =?UTF-8?q?=20for=20RTL=20languages=20(#3393)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #2091. ## Description With the new `vocab.writing_system` property introduced in #3390 (exposed via the language defaults), I was able to finally fix this (I think!). Based on the `Doc`, dispaCy now detects whether it's a RTL or LTR language and adjusts the visualization accordingly. Wherever possible, I've also added `direction` and `lang` attributes. Entity visualization now looks like this: Screenshot 2019-03-11 at 16 06 51 And dependencies like this (ignore the most likely incorrect tags and dependencies): Screenshot 2019-03-11 at 16 51 59 ### Types of change enhancement, bug fix ## Checklist - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. --- spacy/displacy/__init__.py | 12 ++++++-- spacy/displacy/render.py | 58 +++++++++++++++++++++++++++--------- spacy/displacy/templates.py | 17 +++++++---- spacy/tests/test_displacy.py | 20 +++++++++++++ 4 files changed, 86 insertions(+), 21 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 6c5509b14..fadbaaa7e 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -161,7 +161,7 @@ def parse_deps(orig_doc, options={}): "dir": "right", } ) - return {"words": words, "arcs": arcs} + return {"words": words, "arcs": arcs, "settings": get_doc_settings(orig_doc)} def parse_ents(doc, options={}): @@ -177,7 +177,8 @@ def parse_ents(doc, options={}): if not ents: user_warning(Warnings.W006) title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None - return {"text": doc.text, "ents": ents, "title": title} + settings = get_doc_settings(doc) + return {"text": doc.text, "ents": ents, "title": title, "settings": settings} def set_render_wrapper(func): @@ -195,3 +196,10 @@ def set_render_wrapper(func): if not hasattr(func, "__call__"): raise ValueError(Errors.E110.format(obj=type(func))) RENDER_WRAPPER = func + + +def get_doc_settings(doc): + return { + "lang": doc.lang_, + "direction": doc.vocab.writing_system.get("direction", "ltr"), + } diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 2b8e0c7d2..500e49989 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals import uuid -from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS -from .templates import TPL_ENT, TPL_ENTS, TPL_FIGURE, TPL_TITLE, TPL_PAGE +from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS +from 
.templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE from ..util import minify_html, escape_html +DEFAULT_LANG = "en" +DEFAULT_DIR = "ltr" + class DependencyRenderer(object): """Render dependency parses as SVGs.""" @@ -30,6 +33,8 @@ class DependencyRenderer(object): self.color = options.get("color", "#000000") self.bg = options.get("bg", "#ffffff") self.font = options.get("font", "Arial") + self.direction = DEFAULT_DIR + self.lang = DEFAULT_LANG def render(self, parsed, page=False, minify=False): """Render complete markup. @@ -42,13 +47,19 @@ class DependencyRenderer(object): # Create a random ID prefix to make sure parses don't receive the # same ID, even if they're identical id_prefix = uuid.uuid4().hex - rendered = [ - self.render_svg("{}-{}".format(id_prefix, i), p["words"], p["arcs"]) - for i, p in enumerate(parsed) - ] + rendered = [] + for i, p in enumerate(parsed): + if i == 0: + self.direction = p["settings"].get("direction", DEFAULT_DIR) + self.lang = p["settings"].get("lang", DEFAULT_LANG) + render_id = "{}-{}".format(id_prefix, i) + svg = self.render_svg(render_id, p["words"], p["arcs"]) + rendered.append(svg) if page: content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered]) - markup = TPL_PAGE.format(content=content) + markup = TPL_PAGE.format( + content=content, lang=self.lang, dir=self.direction + ) else: markup = "".join(rendered) if minify: @@ -83,6 +94,8 @@ class DependencyRenderer(object): bg=self.bg, font=self.font, content=content, + dir=self.direction, + lang=self.lang, ) def render_word(self, text, tag, i): @@ -95,11 +108,13 @@ class DependencyRenderer(object): """ y = self.offset_y + self.word_spacing x = self.offset_x + i * self.distance + if self.direction == "rtl": + x = self.width - x html_text = escape_html(text) return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y) def render_arrow(self, label, start, end, direction, i): - """Render indivicual arrow. + """Render individual arrow. label (unicode): Dependency label. start (int): Index of start word. @@ -110,6 +125,8 @@ class DependencyRenderer(object): """ level = self.levels.index(end - start) + 1 x_start = self.offset_x + start * self.distance + self.arrow_spacing + if self.direction == "rtl": + x_start = self.width - x_start y = self.offset_y x_end = ( self.offset_x @@ -117,6 +134,8 @@ class DependencyRenderer(object): + start * self.distance - self.arrow_spacing * (self.highest_level - level) / 4 ) + if self.direction == "rtl": + x_end = self.width - x_end y_curve = self.offset_y - level * self.distance / 2 if self.compact: y_curve = self.offset_y - level * self.distance / 6 @@ -124,12 +143,14 @@ class DependencyRenderer(object): y_curve = -self.distance arrowhead = self.get_arrowhead(direction, x_start, y, x_end) arc = self.get_arc(x_start, y, y_curve, x_end) + label_side = "right" if self.direction == "rtl" else "left" return TPL_DEP_ARCS.format( id=self.id, i=i, stroke=self.arrow_stroke, head=arrowhead, label=label, + label_side=label_side, arc=arc, ) @@ -219,6 +240,8 @@ class EntityRenderer(object): self.default_color = "#ddd" self.colors = colors self.ents = options.get("ents", None) + self.direction = DEFAULT_DIR + self.lang = DEFAULT_LANG def render(self, parsed, page=False, minify=False): """Render complete markup. @@ -228,12 +251,15 @@ class EntityRenderer(object): minify (bool): Minify HTML markup. RETURNS (unicode): Rendered HTML markup. 
""" - rendered = [ - self.render_ents(p["text"], p["ents"], p.get("title", None)) for p in parsed - ] + rendered = [] + for i, p in enumerate(parsed): + if i == 0: + self.direction = p["settings"].get("direction", DEFAULT_DIR) + self.lang = p["settings"].get("lang", DEFAULT_LANG) + rendered.append(self.render_ents(p["text"], p["ents"], p["title"])) if page: docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered]) - markup = TPL_PAGE.format(content=docs) + markup = TPL_PAGE.format(content=docs, lang=self.lang, dir=self.direction) else: markup = "".join(rendered) if minify: @@ -261,12 +287,16 @@ class EntityRenderer(object): markup += "
" if self.ents is None or label.upper() in self.ents: color = self.colors.get(label.upper(), self.default_color) - markup += TPL_ENT.format(label=label, text=entity, bg=color) + ent_settings = {"label": label, "text": entity, "bg": color} + if self.direction == "rtl": + markup += TPL_ENT_RTL.format(**ent_settings) + else: + markup += TPL_ENT.format(**ent_settings) else: markup += entity offset = end markup += escape_html(text[offset:]) - markup = TPL_ENTS.format(content=markup, colors=self.colors) + markup = TPL_ENTS.format(content=markup, dir=self.direction) if title: markup = TPL_TITLE.format(title=title) + markup return markup diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index f0922b1e3..4a7c596d8 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -6,7 +6,7 @@ from __future__ import unicode_literals # Jupyter to render it properly in a cell TPL_DEP_SVG = """ -{content} +{content} """ @@ -22,7 +22,7 @@ TPL_DEP_ARCS = """ - {label} + {label} @@ -39,7 +39,7 @@ TPL_TITLE = """ TPL_ENTS = """ -
{content}
+
{content}
""" @@ -50,14 +50,21 @@ TPL_ENT = """ """ +TPL_ENT_RTL = """ + + {text} + {label} + +""" + TPL_PAGE = """ - + displaCy - {content} + {content} """ diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index a65060ea7..24e45bfc1 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import pytest from spacy import displacy from spacy.tokens import Span +from spacy.lang.fa import Persian from .util import get_doc @@ -66,3 +67,22 @@ def test_displacy_render_wrapper(en_vocab): def test_displacy_raises_for_wrong_type(en_vocab): with pytest.raises(ValueError): displacy.render("hello world") + + +def test_displacy_rtl(): + # Source: http://www.sobhe.ir/hazm/ – is this correct? + words = ["ما", "بسیار", "کتاب", "می\u200cخوانیم"] + # These are (likely) wrong, but it's just for testing + pos = ["PRO", "ADV", "N_PL", "V_SUB"] # needs to match lang.fa.tag_map + deps = ["foo", "bar", "foo", "baz"] + heads = [1, 0, 1, -2] + nlp = Persian() + doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps) + doc.ents = [Span(doc, 1, 3, label="TEST")] + html = displacy.render(doc, page=True, style="dep") + assert "direction: rtl" in html + assert 'direction="rtl"' in html + assert 'lang="{}"'.format(nlp.lang) in html + html = displacy.render(doc, page=True, style="ent") + assert "direction: rtl" in html + assert 'lang="{}"'.format(nlp.lang) in html From 886e5966c074039b805438e8947a1e5b0bac7da9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 19:03:52 +0100 Subject: [PATCH 16/27] Update test_displacy.py --- spacy/tests/test_displacy.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 24e45bfc1..6d82bbbaa 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -50,20 +50,6 @@ def test_displacy_spans(en_vocab): assert html.startswith("TEST") - - def test_displacy_raises_for_wrong_type(en_vocab): with pytest.raises(ValueError): displacy.render("hello world") @@ -86,3 +72,18 @@ def test_displacy_rtl(): html = displacy.render(doc, page=True, style="ent") assert "direction: rtl" in html assert 'lang="{}"'.format(nlp.lang) in html + +def test_displacy_render_wrapper(en_vocab): + """Test that displaCy accepts custom rendering wrapper.""" + + def wrapper(html): + return "TEST" + html + "TEST" + + displacy.set_render_wrapper(wrapper) + doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])] + html = displacy.render(doc, style="ent") + assert html.startswith("TESTTEST") + # Restore + displacy.set_render_wrapper(lambda html: html) \ No newline at end of file From 062934aa12179f2e693e022520b1dec7ef1ba45b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 11 Mar 2019 22:26:19 +0100 Subject: [PATCH 17/27] Set version to v2.1.0a12 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 4b48d4d4d..53113e196 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "2.1.0a11" +__version__ = "2.1.0a12" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From 70e5058d7267c7a68d56040df276ea01b3d66ebb Mon Sep 17 00:00:00 2001 From: Matthew 
Honnibal Date: Mon, 11 Mar 2019 22:27:10 +0100 Subject: [PATCH 18/27] Update push-tag script --- bin/push-tag.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/push-tag.sh b/bin/push-tag.sh index 07ebc1bf0..57133499c 100755 --- a/bin/push-tag.sh +++ b/bin/push-tag.sh @@ -7,6 +7,7 @@ git diff-index --quiet HEAD git checkout $1 git pull origin $1 +git push origin $1 version=$(grep "__version__ = " spacy/about.py) version=${version/__version__ = } @@ -15,4 +16,4 @@ version=${version/\'/} version=${version/\"/} version=${version/\"/} git tag "v$version" -git push origin --tags +git push origin "v$version" --tags From 1664d1fa62185fa21e8141678f7740df2b37b7e9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 12 Mar 2019 11:13:03 +0100 Subject: [PATCH 19/27] Update universe [ci skip] --- website/meta/universe.json | 49 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index d4c24eceb..e81654a07 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -524,6 +524,22 @@ }, "category": ["standalone", "research"] }, + { + "id": "scispacy", + "title": "scispaCy", + "slogan": "A full spaCy pipeline and models for scientific/biomedical documents", + "github": "allenai/scispacy", + "pip": "scispacy", + "thumb": "https://i.imgur.com/dJQSclW.png", + "url": "https://allenai.github.io/scispacy/", + "author": " Allen Institute for Artificial Intelligence", + "author_links": { + "github": "allenai", + "twitter": "allenai_org", + "website": "http://allenai.org" + }, + "category": ["models", "research"] + }, { "id": "textacy", "slogan": "NLP, before and after spaCy", @@ -851,6 +867,22 @@ }, "category": ["courses"] }, + { + "type": "education", + "id": "datacamp-advanced-nlp", + "title": "Advanced Natural Language Processing with spaCy", + "slogan": "Datacamp, 2019", + "description": "If you're working with a lot of text, you'll eventually want to know more about it. For example, what's it about? What do the words mean in context? Who is doing what to whom? What companies and products are mentioned? Which texts are similar to each other? In this course, you'll learn how to use spaCy, a fast-growing industry standard library for NLP in Python, to build advanced natural language understanding systems, using both rule-based and machine learning approaches.", + "url": "https://www.datacamp.com/courses/advanced-nlp-with-spacy", + "thumb": "https://i.imgur.com/0Zks7c0.jpg", + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines", + "website": "https://ines.io" + }, + "category": ["courses"] + }, { "type": "education", "id": "learning-path-spacy", @@ -910,6 +942,7 @@ "description": "Most NLP projects rely crucially on the quality of annotations used for training and evaluating models. In this episode, Matt and Ines of Explosion AI tell us how Prodigy can improve data annotation and model development workflows. Prodigy is an annotation tool implemented as a python library, and it comes with a web application and a command line interface. A developer can define input data streams and design simple annotation interfaces. Prodigy can help break down complex annotation decisions into a series of binary decisions, and it provides easy integration with spaCy models. 
Developers can specify how models should be modified as new annotations come in in an active learning framework.", "soundcloud": "559200912", "thumb": "https://i.imgur.com/hOBQEzc.jpg", + "url": "https://soundcloud.com/nlp-highlights/78-where-do-corpora-come-from-with-matt-honnibal-and-ines-montani", "author": "Matt Gardner, Waleed Ammar (Allen AI)", "author_links": { "website": "https://soundcloud.com/nlp-highlights" @@ -925,12 +958,28 @@ "iframe": "https://www.pythonpodcast.com/wp-content/plugins/podlove-podcasting-plugin-for-wordpress/lib/modules/podlove_web_player/player_v4/dist/share.html?episode=https://www.pythonpodcast.com/?podlove_player4=176", "iframe_height": 200, "thumb": "https://i.imgur.com/rpo6BuY.png", + "url": "https://www.podcastinit.com/episode-87-spacy-with-matthew-honnibal/", "author": "Tobias Macey", "author_links": { "website": "https://www.podcastinit.com" }, "category": ["podcasts"] }, + { + "type": "education", + "id": "talk-python-podcast", + "title": "Talk Python 202: Building a software business", + "slogan": "March 2019", + "description": "One core question around open source is how do you fund it? Well, there is always that PayPal donate button. But that's been a tremendous failure for many projects. Often the go-to answer is consulting. But what if you don't want to trade time for money? You could take things up a notch and change the equation, exchanging value for money. That's what Ines Montani and her co-founder did when they started Explosion AI with spaCy as the foundation.", + "thumb": "https://i.imgur.com/q1twuK8.png", + "url": "https://talkpython.fm/episodes/show/202/building-a-software-business", + "soundcloud": "588364857", + "author": "Michael Kennedy", + "author_links": { + "website": "https://talkpython.fm/" + }, + "category": ["podcasts"] + }, { "id": "adam_qas", "title": "ADAM: Question Answering System", From 59c06204874321ce3c1a296d2f7ac309ad9615ad Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 12 Mar 2019 12:07:11 +0100 Subject: [PATCH 20/27] Auto-format --- website/docs/usage/adding-languages.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 6718bc7fd..405e21f4d 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -105,11 +105,11 @@ to know the language's character set. If the language you're adding uses non-latin characters, you might need to define the required character classes in the global [`char_classes.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py). -For efficiency, spaCy uses hard-coded unicode ranges to define character classes, -the definitions of which can be found on [Wikipedia](https://en.wikipedia.org/wiki/Unicode_block). -If the language requires very specific punctuation -rules, you should consider overwriting the default regular expressions with your -own in the language's `Defaults`. +For efficiency, spaCy uses hard-coded unicode ranges to define character +classes, the definitions of which can be found on +[Wikipedia](https://en.wikipedia.org/wiki/Unicode_block). If the language +requires very specific punctuation rules, you should consider overwriting the +default regular expressions with your own in the language's `Defaults`. 
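
[Editor's note] The adding-languages passage above mentions that spaCy defines character classes as hard-coded unicode ranges in `char_classes.py` and interpolates them into the tokenizer's punctuation regexes. The sketch below only illustrates that pattern; the block ranges, variable names and the infix rule are simplified assumptions, not spaCy's actual definitions, and it assumes Python 3.

```python
# Illustrative sketch of unicode-range character classes, in the style of
# spacy/lang/char_classes.py. Ranges and names here are assumptions.
import re

_latin = "a-zA-Z"
_bengali = "\u0980-\u09ff"   # Bengali unicode block
_hebrew = "\u0591-\u05f4"    # Hebrew unicode block (approximate)

ALPHA = _latin + _bengali + _hebrew

# Example: an infix rule that splits on a hyphen only between alphabetic characters
infix_re = re.compile("(?<=[{a}])-(?=[{a}])".format(a=ALPHA))
print(bool(infix_re.search("mother-in-law")))  # True
```
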
From 3abf0e6b9fadee922d52e99f11a229eef27dd390 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 12 Mar 2019 12:07:40 +0100 Subject: [PATCH 21/27] Replace dev-resources links with real examples --- website/docs/usage/101/_language-data.md | 10 +++++----- website/docs/usage/adding-languages.md | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/website/docs/usage/101/_language-data.md b/website/docs/usage/101/_language-data.md index 782fc7e5d..6834f884f 100644 --- a/website/docs/usage/101/_language-data.md +++ b/website/docs/usage/101/_language-data.md @@ -39,9 +39,9 @@ together all components and creating the `Language` subclass – for example, | **Morph rules**
[`morph_rules.py`][morph_rules.py] | Exception rules for morphological analysis of irregular words like personal pronouns. | [stop_words.py]: - https://github.com/explosion/spacy-dev-resources/tree/master/templates/new_language/stop_words.py + https://github.com/explosion/spaCy/tree/master/spacy/lang/en/stop_words.py [tokenizer_exceptions.py]: - https://github.com/explosion/spacy-dev-resources/tree/master/templates/new_language/tokenizer_exceptions.py + https://github.com/explosion/spaCy/tree/master/spacy/lang/de/tokenizer_exceptions.py [norm_exceptions.py]: https://github.com/explosion/spaCy/tree/master/spacy/lang/norm_exceptions.py [punctuation.py]: @@ -49,12 +49,12 @@ together all components and creating the `Language` subclass – for example, [char_classes.py]: https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py [lex_attrs.py]: - https://github.com/explosion/spacy-dev-resources/tree/master/templates/new_language/lex_attrs.py + https://github.com/explosion/spaCy/tree/master/spacy/lang/en/lex_attrs.py [syntax_iterators.py]: https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py [lemmatizer.py]: - https://github.com/explosion/spacy-dev-resources/tree/master/templates/new_language/lemmatizer.py + https://github.com/explosion/spaCy/tree/master/spacy/lang/de/lemmatizer.py [tag_map.py]: - https://github.com/explosion/spacy-dev-resources/tree/master/templates/new_language/tag_map.py + https://github.com/explosion/spaCy/tree/master/spacy/lang/en/tag_map.py [morph_rules.py]: https://github.com/explosion/spaCy/tree/master/spacy/lang/en/morph_rules.py diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 405e21f4d..a66d71d26 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -121,9 +121,9 @@ spaCy, named according to the language's code and resources specific to Spanish are placed into a directory `spacy/lang/es`, which can be imported as `spacy.lang.es`. -To get started, you can use our -[templates](https://github.com/explosion/spacy-dev-resources/templates/new_language) -for the most important files. Here's what the class template looks like: +To get started, you can check out the +[existing languages](https://github.com/explosion/spacy/tree/master/spacy/lang). 
+Here's what the class could look like: ```python ### __init__.py (excerpt) From 72fb324d954207623030579fcdeda4e3a2e0b8cf Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 12 Mar 2019 12:07:56 +0100 Subject: [PATCH 22/27] Add vector training script to bin [ci skip] --- bin/train_word_vectors.py | 107 +++++++++++++++++++++++++ website/docs/usage/adding-languages.md | 4 +- 2 files changed, 109 insertions(+), 2 deletions(-) create mode 100644 bin/train_word_vectors.py diff --git a/bin/train_word_vectors.py b/bin/train_word_vectors.py new file mode 100644 index 000000000..8482a7a55 --- /dev/null +++ b/bin/train_word_vectors.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +from __future__ import print_function, unicode_literals, division + +import logging +from pathlib import Path +from collections import defaultdict +from gensim.models import Word2Vec +from preshed.counter import PreshCounter +import plac +import spacy + +logger = logging.getLogger(__name__) + + +class Corpus(object): + def __init__(self, directory, min_freq=10): + self.directory = directory + self.counts = PreshCounter() + self.strings = {} + self.min_freq = min_freq + + def count_doc(self, doc): + # Get counts for this document + for word in doc: + self.counts.inc(word.orth, 1) + return len(doc) + + def __iter__(self): + for text_loc in iter_dir(self.directory): + with text_loc.open("r", encoding="utf-8") as file_: + text = file_.read() + yield text + + +def iter_dir(loc): + dir_path = Path(loc) + for fn_path in dir_path.iterdir(): + if fn_path.is_dir(): + for sub_path in fn_path.iterdir(): + yield sub_path + else: + yield fn_path + + +@plac.annotations( + lang=("ISO language code"), + in_dir=("Location of input directory"), + out_loc=("Location of output file"), + n_workers=("Number of workers", "option", "n", int), + size=("Dimension of the word vectors", "option", "d", int), + window=("Context window size", "option", "w", int), + min_count=("Min count", "option", "m", int), + negative=("Number of negative samples", "option", "g", int), + nr_iter=("Number of iterations", "option", "i", int), +) +def main( + lang, + in_dir, + out_loc, + negative=5, + n_workers=4, + window=5, + size=128, + min_count=10, + nr_iter=2, +): + logging.basicConfig( + format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO + ) + model = Word2Vec( + size=size, + window=window, + min_count=min_count, + workers=n_workers, + sample=1e-5, + negative=negative, + ) + nlp = spacy.blank(lang) + corpus = Corpus(in_dir) + total_words = 0 + total_sents = 0 + for text_no, text_loc in enumerate(iter_dir(corpus.directory)): + with text_loc.open("r", encoding="utf-8") as file_: + text = file_.read() + total_sents += text.count("\n") + doc = nlp(text) + total_words += corpus.count_doc(doc) + logger.info( + "PROGRESS: at batch #%i, processed %i words, keeping %i word types", + text_no, + total_words, + len(corpus.strings), + ) + model.corpus_count = total_sents + model.raw_vocab = defaultdict(int) + for orth, freq in corpus.counts: + if freq >= min_count: + model.raw_vocab[nlp.vocab.strings[orth]] = freq + model.scale_vocab() + model.finalize_vocab() + model.iter = nr_iter + model.train(corpus) + model.save(out_loc) + + +if __name__ == "__main__": + plac.call(main) diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index a66d71d26..236df6402 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -631,13 +631,13 @@ of using deep learning for NLP with limited labeled 
data. The vectors are also useful by themselves – they power the `.similarity` methods in spaCy. For best results, you should pre-process the text with spaCy before training the Word2vec model. This ensures your tokenization will match. You can use our -[word vectors training script](https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py), +[word vectors training script](https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py), which pre-processes the text with your language-specific tokenizer and trains the model using [Gensim](https://radimrehurek.com/gensim/). The `vectors.bin` file should consist of one word and vector per line. ```python -https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py +https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py ``` If you don't have a large sample of text available, you can also convert word From 2912ddc9a60ebcb11d3bcfc0747c6089d8382a00 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 12 Mar 2019 13:30:33 +0100 Subject: [PATCH 23/27] Don't set extension attribute in Japanese (closes #3398) --- spacy/lang/ja/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index e35967409..3a6074bba 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -8,16 +8,13 @@ from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from ...attrs import LANG from ...language import Language -from ...tokens import Doc, Token +from ...tokens import Doc from ...compat import copy_reg from ...util import DummyTokenizer ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"]) -# TODO: Is this the right place for this? -Token.set_extension("mecab_tag", default=None) - def try_mecab_import(): """Mecab is required for Japanese support, so check for it. @@ -82,10 +79,12 @@ class JapaneseTokenizer(DummyTokenizer): words = [x.surface for x in dtokens] spaces = [False] * len(words) doc = Doc(self.vocab, words=words, spaces=spaces) + mecab_tags = [] for token, dtoken in zip(doc, dtokens): - token._.mecab_tag = dtoken.pos + mecab_tags.append(dtoken.pos) token.tag_ = resolve_pos(dtoken) token.lemma_ = dtoken.lemma + doc.user_data["mecab_tags"] = mecab_tags return doc From 8a4121cbc2d48d24e8e3a2304f756d5977f7be5a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 12 Mar 2019 13:32:56 +0100 Subject: [PATCH 24/27] Fix bug introduced by component_cfg --- spacy/language.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index e97b74a77..6432f3e98 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -900,6 +900,11 @@ class DisabledPipes(list): def _pipe(func, docs, kwargs): + # We added some args for pipe that __call__ doesn't expect. 
+ kwargs = dict(kwargs) + for arg in ["n_threads", "batch_size"]: + if arg in kwargs: + kwargs.pop(arg) for doc in docs: doc = func(doc, **kwargs) yield doc From 479b5cff430de8f85d7cba43a040110442a28c37 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 12 Mar 2019 13:35:34 +0100 Subject: [PATCH 25/27] Auto-format [ci skip] --- spacy/tests/test_displacy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 6d82bbbaa..2097fd081 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -73,6 +73,7 @@ def test_displacy_rtl(): assert "direction: rtl" in html assert 'lang="{}"'.format(nlp.lang) in html + def test_displacy_render_wrapper(en_vocab): """Test that displaCy accepts custom rendering wrapper.""" @@ -85,5 +86,5 @@ def test_displacy_render_wrapper(en_vocab): html = displacy.render(doc, style="ent") assert html.startswith("TESTTEST") - # Restore - displacy.set_render_wrapper(lambda html: html) \ No newline at end of file + # Restore + displacy.set_render_wrapper(lambda html: html) From 6aab2d8533f75ed1b86508464bfe6c5e931d176e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 12 Mar 2019 15:14:06 +0100 Subject: [PATCH 26/27] Set version to v2.1.0a13 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 53113e196..d2bfae028 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "2.1.0a12" +__version__ = "2.1.0a13" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From d842d5698e4df7f3c016eab572ef935435261691 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 12 Mar 2019 15:21:58 +0100 Subject: [PATCH 27/27] Tidy up website and add eslint config [ci skip] --- website/.eslintrc | 27 + website/package-lock.json | 828 ++------------------------- website/package.json | 2 +- website/src/components/accordion.js | 2 +- website/src/components/github.js | 3 +- website/src/components/progress.js | 27 +- website/src/components/quickstart.js | 20 +- website/src/components/search.js | 8 +- website/src/components/section.js | 2 +- website/src/components/sidebar.js | 4 +- website/src/templates/index.js | 60 +- website/src/templates/models.js | 10 +- website/src/widgets/changelog.js | 3 +- website/src/widgets/landing.js | 15 +- 14 files changed, 147 insertions(+), 864 deletions(-) create mode 100644 website/.eslintrc diff --git a/website/.eslintrc b/website/.eslintrc new file mode 100644 index 000000000..98d2c1608 --- /dev/null +++ b/website/.eslintrc @@ -0,0 +1,27 @@ +{ + "extends": ["standard", "prettier"], + "plugins": ["standard", "react", "react-hooks"], + "rules": { + "no-var": "error", + "no-unused-vars": 1, + "arrow-spacing": ["error", { "before": true, "after": true }], + "indent": ["error", 4], + "semi": ["error", "never"], + "arrow-parens": ["error", "as-needed"], + "standard/object-curly-even-spacing": ["error", "either"], + "standard/array-bracket-even-spacing": ["error", "either"], + "standard/computed-property-even-spacing": ["error", "even"], + "standard/no-callback-literal": ["error", ["cb", "callback"]], + "react/jsx-uses-react": "error", + "react/jsx-uses-vars": "error", + "react-hooks/rules-of-hooks": "error", + "react-hooks/exhaustive-deps": "warn" + }, + "parser": "babel-eslint", + "parserOptions": { + "ecmaVersion": 8 + }, + "env": 
{ + "browser": true + } +} diff --git a/website/package-lock.json b/website/package-lock.json index e8d9a5abe..cb1731c1b 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -1833,9 +1833,9 @@ } }, "acorn": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-6.1.0.tgz", - "integrity": "sha512-MW/FjM+IvU9CgBzjO3UIPCE2pyEwUsoFl+VGdczOPEdxfGFjuKny/gN54mOuX7Qxmb9Rg9MCn2oKiSUeW+pjrw==" + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-6.1.1.tgz", + "integrity": "sha512-jPTiwtOxaHNaAPg/dmrJ/beuzLRnXtB0kQPQ8JpotKJgTB6rX6c8mlf315941pyjBSaPg8NHXS9fhP4u17DpGA==" }, "acorn-dynamic-import": { "version": "3.0.0", @@ -5958,9 +5958,9 @@ "integrity": "sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ=" }, "eslint": { - "version": "5.14.1", - "resolved": "https://registry.npmjs.org/eslint/-/eslint-5.14.1.tgz", - "integrity": "sha512-CyUMbmsjxedx8B0mr79mNOqetvkbij/zrXnFeK2zc3pGRn3/tibjiNAv/3UxFEyfMDjh+ZqTrJrEGBFiGfD5Og==", + "version": "5.15.1", + "resolved": "https://registry.npmjs.org/eslint/-/eslint-5.15.1.tgz", + "integrity": "sha512-NTcm6vQ+PTgN3UBsALw5BMhgO6i5EpIjQF/Xb5tIh3sk9QhrFafujUOczGz4J24JBlzWclSB9Vmx8d+9Z6bFCg==", "requires": { "@babel/code-frame": "^7.0.0", "ajv": "^6.9.1", @@ -5968,7 +5968,7 @@ "cross-spawn": "^6.0.5", "debug": "^4.0.1", "doctrine": "^3.0.0", - "eslint-scope": "^4.0.0", + "eslint-scope": "^4.0.2", "eslint-utils": "^1.3.1", "eslint-visitor-keys": "^1.0.0", "espree": "^5.0.1", @@ -6001,9 +6001,9 @@ }, "dependencies": { "ajv": { - "version": "6.9.2", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.9.2.tgz", - "integrity": "sha512-4UFy0/LgDo7Oa/+wOAlj44tp9K78u38E5/359eSrqEp1Z5PdVfimCcs7SluXMP755RUQu6d2b4AvF0R1C9RZjg==", + "version": "6.10.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.10.0.tgz", + "integrity": "sha512-nffhOpkymDECQyR0mnsUtoCE8RlX38G0rYP+wgLWFyZuUyuuojSSvi/+euOiQBIn63whYwYVIIH1TvE3tu4OEg==", "requires": { "fast-deep-equal": "^2.0.1", "fast-json-stable-stringify": "^2.0.0", @@ -6037,9 +6037,9 @@ } }, "eslint-scope": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-4.0.0.tgz", - "integrity": "sha512-1G6UTDi7Jc1ELFwnR58HV4fK9OQK4S6N985f166xqXxpjU6plxFISJa2Ba9KCQuFa8RCnj/lSFJbHo7UFDBnUA==", + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-4.0.2.tgz", + "integrity": "sha512-5q1+B/ogmHl8+paxtOKx38Z8LtWkVGuNt3+GQNErqwLl6ViNp/gdJGMCjZNxZ8j/VYjDNZ2Fo+eQc1TAVPIzbg==", "requires": { "esrecurse": "^4.1.0", "estraverse": "^4.1.1" @@ -6448,52 +6448,6 @@ } } }, - "expand-range": { - "version": "1.8.2", - "resolved": "http://registry.npmjs.org/expand-range/-/expand-range-1.8.2.tgz", - "integrity": "sha1-opnv/TNf4nIeuujiV+x5ZE/IUzc=", - "requires": { - "fill-range": "^2.1.0" - }, - "dependencies": { - "fill-range": { - "version": "2.2.4", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-2.2.4.tgz", - "integrity": "sha512-cnrcCbj01+j2gTG921VZPnHbjmdAf8oQV/iGeV2kZxGSyfYjjTyY79ErsK1WJWMpw6DaApEX72binqJE+/d+5Q==", - "requires": { - "is-number": "^2.1.0", - "isobject": "^2.0.0", - "randomatic": "^3.0.0", - "repeat-element": "^1.1.2", - "repeat-string": "^1.5.2" - } - }, - "is-number": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-2.1.0.tgz", - "integrity": "sha1-Afy7s5NGOlSPL0ZszhbezknbkI8=", - "requires": { - "kind-of": "^3.0.2" - } - }, - "isobject": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-2.1.0.tgz", - "integrity": 
"sha1-8GVWEJaj8dou9GJy+BXIQNh+DIk=", - "requires": { - "isarray": "1.0.0" - } - }, - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, "expand-template": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz", @@ -6818,11 +6772,6 @@ "resolved": "https://registry.npmjs.org/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz", "integrity": "sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==" }, - "filename-regex": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/filename-regex/-/filename-regex-2.0.1.tgz", - "integrity": "sha1-wcS5vuPglyXdsQa3XB4wH+LxiyY=" - }, "filename-reserved-regex": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/filename-reserved-regex/-/filename-reserved-regex-2.0.0.tgz", @@ -7130,468 +7079,6 @@ "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=" }, - "fsevents": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-1.2.4.tgz", - "integrity": "sha512-z8H8/diyk76B7q5wg+Ud0+CqzcAF3mBBI/bA5ne5zrRUUIvNkJY//D3BqyH571KuAC4Nr7Rw7CjWX4r0y9DvNg==", - "optional": true, - "requires": { - "nan": "^2.9.2", - "node-pre-gyp": "^0.10.0" - }, - "dependencies": { - "abbrev": { - "version": "1.1.1", - "bundled": true, - "optional": true - }, - "ansi-regex": { - "version": "2.1.1", - "bundled": true - }, - "aproba": { - "version": "1.2.0", - "bundled": true, - "optional": true - }, - "are-we-there-yet": { - "version": "1.1.4", - "bundled": true, - "optional": true, - "requires": { - "delegates": "^1.0.0", - "readable-stream": "^2.0.6" - } - }, - "balanced-match": { - "version": "1.0.0", - "bundled": true - }, - "brace-expansion": { - "version": "1.1.11", - "bundled": true, - "requires": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "chownr": { - "version": "1.0.1", - "bundled": true, - "optional": true - }, - "code-point-at": { - "version": "1.1.0", - "bundled": true - }, - "concat-map": { - "version": "0.0.1", - "bundled": true - }, - "console-control-strings": { - "version": "1.1.0", - "bundled": true - }, - "core-util-is": { - "version": "1.0.2", - "bundled": true, - "optional": true - }, - "debug": { - "version": "2.6.9", - "bundled": true, - "optional": true, - "requires": { - "ms": "2.0.0" - } - }, - "deep-extend": { - "version": "0.5.1", - "bundled": true, - "optional": true - }, - "delegates": { - "version": "1.0.0", - "bundled": true, - "optional": true - }, - "detect-libc": { - "version": "1.0.3", - "bundled": true, - "optional": true - }, - "fs-minipass": { - "version": "1.2.5", - "bundled": true, - "optional": true, - "requires": { - "minipass": "^2.2.1" - } - }, - "fs.realpath": { - "version": "1.0.0", - "bundled": true, - "optional": true - }, - "gauge": { - "version": "2.7.4", - "bundled": true, - "optional": true, - "requires": { - "aproba": "^1.0.3", - "console-control-strings": "^1.0.0", - "has-unicode": "^2.0.0", - "object-assign": "^4.1.0", - "signal-exit": "^3.0.0", - "string-width": "^1.0.1", - "strip-ansi": "^3.0.1", - "wide-align": "^1.1.0" - } - }, - "glob": { - "version": "7.1.2", - "bundled": true, - "optional": true, - "requires": { - "fs.realpath": "^1.0.0", - "inflight": "^1.0.4", - "inherits": "2", - "minimatch": "^3.0.4", - "once": "^1.3.0", - 
"path-is-absolute": "^1.0.0" - } - }, - "has-unicode": { - "version": "2.0.1", - "bundled": true, - "optional": true - }, - "iconv-lite": { - "version": "0.4.21", - "bundled": true, - "optional": true, - "requires": { - "safer-buffer": "^2.1.0" - } - }, - "ignore-walk": { - "version": "3.0.1", - "bundled": true, - "optional": true, - "requires": { - "minimatch": "^3.0.4" - } - }, - "inflight": { - "version": "1.0.6", - "bundled": true, - "optional": true, - "requires": { - "once": "^1.3.0", - "wrappy": "1" - } - }, - "inherits": { - "version": "2.0.3", - "bundled": true - }, - "ini": { - "version": "1.3.5", - "bundled": true, - "optional": true - }, - "is-fullwidth-code-point": { - "version": "1.0.0", - "bundled": true, - "requires": { - "number-is-nan": "^1.0.0" - } - }, - "isarray": { - "version": "1.0.0", - "bundled": true, - "optional": true - }, - "minimatch": { - "version": "3.0.4", - "bundled": true, - "requires": { - "brace-expansion": "^1.1.7" - } - }, - "minimist": { - "version": "0.0.8", - "bundled": true - }, - "minipass": { - "version": "2.2.4", - "bundled": true, - "requires": { - "safe-buffer": "^5.1.1", - "yallist": "^3.0.0" - } - }, - "minizlib": { - "version": "1.1.0", - "bundled": true, - "optional": true, - "requires": { - "minipass": "^2.2.1" - } - }, - "mkdirp": { - "version": "0.5.1", - "bundled": true, - "requires": { - "minimist": "0.0.8" - } - }, - "ms": { - "version": "2.0.0", - "bundled": true, - "optional": true - }, - "needle": { - "version": "2.2.0", - "bundled": true, - "optional": true, - "requires": { - "debug": "^2.1.2", - "iconv-lite": "^0.4.4", - "sax": "^1.2.4" - } - }, - "node-pre-gyp": { - "version": "0.10.0", - "bundled": true, - "optional": true, - "requires": { - "detect-libc": "^1.0.2", - "mkdirp": "^0.5.1", - "needle": "^2.2.0", - "nopt": "^4.0.1", - "npm-packlist": "^1.1.6", - "npmlog": "^4.0.2", - "rc": "^1.1.7", - "rimraf": "^2.6.1", - "semver": "^5.3.0", - "tar": "^4" - } - }, - "nopt": { - "version": "4.0.1", - "bundled": true, - "optional": true, - "requires": { - "abbrev": "1", - "osenv": "^0.1.4" - } - }, - "npm-bundled": { - "version": "1.0.3", - "bundled": true, - "optional": true - }, - "npm-packlist": { - "version": "1.1.10", - "bundled": true, - "optional": true, - "requires": { - "ignore-walk": "^3.0.1", - "npm-bundled": "^1.0.1" - } - }, - "npmlog": { - "version": "4.1.2", - "bundled": true, - "optional": true, - "requires": { - "are-we-there-yet": "~1.1.2", - "console-control-strings": "~1.1.0", - "gauge": "~2.7.3", - "set-blocking": "~2.0.0" - } - }, - "number-is-nan": { - "version": "1.0.1", - "bundled": true - }, - "object-assign": { - "version": "4.1.1", - "bundled": true, - "optional": true - }, - "once": { - "version": "1.4.0", - "bundled": true, - "requires": { - "wrappy": "1" - } - }, - "os-homedir": { - "version": "1.0.2", - "bundled": true, - "optional": true - }, - "os-tmpdir": { - "version": "1.0.2", - "bundled": true, - "optional": true - }, - "osenv": { - "version": "0.1.5", - "bundled": true, - "optional": true, - "requires": { - "os-homedir": "^1.0.0", - "os-tmpdir": "^1.0.0" - } - }, - "path-is-absolute": { - "version": "1.0.1", - "bundled": true, - "optional": true - }, - "process-nextick-args": { - "version": "2.0.0", - "bundled": true, - "optional": true - }, - "rc": { - "version": "1.2.7", - "bundled": true, - "optional": true, - "requires": { - "deep-extend": "^0.5.1", - "ini": "~1.3.0", - "minimist": "^1.2.0", - "strip-json-comments": "~2.0.1" - }, - "dependencies": { - "minimist": { - "version": "1.2.0", - 
"bundled": true, - "optional": true - } - } - }, - "readable-stream": { - "version": "2.3.6", - "bundled": true, - "optional": true, - "requires": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.3", - "isarray": "~1.0.0", - "process-nextick-args": "~2.0.0", - "safe-buffer": "~5.1.1", - "string_decoder": "~1.1.1", - "util-deprecate": "~1.0.1" - } - }, - "rimraf": { - "version": "2.6.2", - "bundled": true, - "optional": true, - "requires": { - "glob": "^7.0.5" - } - }, - "safe-buffer": { - "version": "5.1.1", - "bundled": true - }, - "safer-buffer": { - "version": "2.1.2", - "bundled": true, - "optional": true - }, - "sax": { - "version": "1.2.4", - "bundled": true, - "optional": true - }, - "semver": { - "version": "5.5.0", - "bundled": true, - "optional": true - }, - "set-blocking": { - "version": "2.0.0", - "bundled": true, - "optional": true - }, - "signal-exit": { - "version": "3.0.2", - "bundled": true, - "optional": true - }, - "string-width": { - "version": "1.0.2", - "bundled": true, - "requires": { - "code-point-at": "^1.0.0", - "is-fullwidth-code-point": "^1.0.0", - "strip-ansi": "^3.0.0" - } - }, - "string_decoder": { - "version": "1.1.1", - "bundled": true, - "optional": true, - "requires": { - "safe-buffer": "~5.1.0" - } - }, - "strip-ansi": { - "version": "3.0.1", - "bundled": true, - "requires": { - "ansi-regex": "^2.0.0" - } - }, - "strip-json-comments": { - "version": "2.0.1", - "bundled": true, - "optional": true - }, - "tar": { - "version": "4.4.1", - "bundled": true, - "optional": true, - "requires": { - "chownr": "^1.0.1", - "fs-minipass": "^1.2.5", - "minipass": "^2.2.4", - "minizlib": "^1.1.0", - "mkdirp": "^0.5.0", - "safe-buffer": "^5.1.1", - "yallist": "^3.0.2" - } - }, - "util-deprecate": { - "version": "1.0.2", - "bundled": true, - "optional": true - }, - "wide-align": { - "version": "1.1.2", - "bundled": true, - "optional": true, - "requires": { - "string-width": "^1.0.2" - } - }, - "wrappy": { - "version": "1.0.2", - "bundled": true - }, - "yallist": { - "version": "3.0.2", - "bundled": true - } - } - }, "fstream": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/fstream/-/fstream-1.0.11.tgz", @@ -8322,14 +7809,14 @@ } }, "gatsby-source-filesystem": { - "version": "2.0.20", - "resolved": "https://registry.npmjs.org/gatsby-source-filesystem/-/gatsby-source-filesystem-2.0.20.tgz", - "integrity": "sha512-nS2hBsqKEQIJ5Yd+g9p++FcsfmvbQmZlBUzx04VPBYZBu2LuLA/ZxQkmdiTNnbDQ18KJw0Zu2PnmUerPnEMqyg==", + "version": "2.0.24", + "resolved": "https://registry.npmjs.org/gatsby-source-filesystem/-/gatsby-source-filesystem-2.0.24.tgz", + "integrity": "sha512-KzyHzuXni9hOiZFDgeoH5ABJZqb59fSJNGr2C4U6B1AlGXFMucFK45Fh3V8axtpi833bIbCb9rGmK+tvL4Qb1w==", "requires": { "@babel/runtime": "^7.0.0", "better-queue": "^3.8.7", "bluebird": "^3.5.0", - "chokidar": "^1.7.0", + "chokidar": "^2.1.2", "file-type": "^10.2.0", "fs-extra": "^5.0.0", "got": "^7.1.0", @@ -8343,83 +7830,6 @@ "xstate": "^3.1.0" }, "dependencies": { - "anymatch": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-1.3.2.tgz", - "integrity": "sha512-0XNayC8lTHQ2OI8aljNCN3sSx6hsr/1+rlcDAotXJR7C1oZZHCNsfpbKwMjRA3Uqb5tF1Rae2oloTr4xpq+WjA==", - "requires": { - "micromatch": "^2.1.5", - "normalize-path": "^2.0.0" - } - }, - "arr-diff": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", - "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", - "requires": { - "arr-flatten": "^1.0.1" - } - }, - "array-unique": { - "version": "0.2.1", - 
"resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", - "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=" - }, - "braces": { - "version": "1.8.5", - "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", - "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", - "requires": { - "expand-range": "^1.8.1", - "preserve": "^0.2.0", - "repeat-element": "^1.1.2" - } - }, - "chokidar": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz", - "integrity": "sha1-eY5ol3gVHIB2tLNg5e3SjNortGg=", - "requires": { - "anymatch": "^1.3.0", - "async-each": "^1.0.0", - "fsevents": "^1.0.0", - "glob-parent": "^2.0.0", - "inherits": "^2.0.1", - "is-binary-path": "^1.0.0", - "is-glob": "^2.0.0", - "path-is-absolute": "^1.0.0", - "readdirp": "^2.0.0" - } - }, - "expand-brackets": { - "version": "0.1.5", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", - "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", - "requires": { - "is-posix-bracket": "^0.1.0" - } - }, - "extglob": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", - "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", - "requires": { - "is-extglob": "^1.0.0" - } - }, - "file-type": { - "version": "10.7.1", - "resolved": "https://registry.npmjs.org/file-type/-/file-type-10.7.1.tgz", - "integrity": "sha512-kUc4EE9q3MH6kx70KumPOvXLZLEJZzY9phEVg/bKWyGZ+OA9KoKZzFR4HS0yDmNv31sJkdf4hbTERIfplF9OxQ==" - }, - "glob-parent": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-2.0.0.tgz", - "integrity": "sha1-gTg9ctsFT8zPUzbaqQLxgvbtuyg=", - "requires": { - "is-glob": "^2.0.0" - } - }, "got": { "version": "7.1.0", "resolved": "https://registry.npmjs.org/got/-/got-7.1.0.tgz", @@ -8441,47 +7851,6 @@ "url-to-options": "^1.0.1" } }, - "is-extglob": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", - "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=" - }, - "is-glob": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", - "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", - "requires": { - "is-extglob": "^1.0.0" - } - }, - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "requires": { - "is-buffer": "^1.1.5" - } - }, - "micromatch": { - "version": "2.3.11", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", - "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", - "requires": { - "arr-diff": "^2.0.0", - "array-unique": "^0.2.1", - "braces": "^1.8.2", - "expand-brackets": "^0.1.4", - "extglob": "^0.3.1", - "filename-regex": "^2.0.0", - "is-extglob": "^1.0.0", - "is-glob": "^2.0.1", - "kind-of": "^3.0.2", - "normalize-path": "^2.0.1", - "object.omit": "^2.0.0", - "parse-glob": "^3.0.4", - "regex-cache": "^0.4.2" - } - }, "pify": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/pify/-/pify-4.0.1.tgz", @@ -8493,12 +7862,12 @@ "integrity": "sha1-4mDHj2Fhzdmw5WzD4Khd4Xx6V74=" }, "read-chunk": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/read-chunk/-/read-chunk-3.0.0.tgz", - "integrity": "sha512-8lBUVPjj9TC5bKLBacB+rpexM03+LWiYbv6ma3BeWmUYXGxqA1WNNgIZHq/iIsCrbFMzPhFbkOqdsyOFRnuoXg==", + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/read-chunk/-/read-chunk-3.1.0.tgz", + "integrity": 
"sha512-ZdiZJXXoZYE08SzZvTipHhI+ZW0FpzxmFtLI3vIeMuRN9ySbIZ+SZawKogqJ7dxW9fJ/W73BNtxu4Zu/bZp+Ng==", "requires": { - "pify": "^4.0.0", - "with-open-file": "^0.1.3" + "pify": "^4.0.1", + "with-open-file": "^0.1.5" } } } @@ -8742,38 +8111,6 @@ "path-is-absolute": "^1.0.0" } }, - "glob-base": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/glob-base/-/glob-base-0.3.0.tgz", - "integrity": "sha1-27Fk9iIbHAscz4Kuoyi0l98Oo8Q=", - "requires": { - "glob-parent": "^2.0.0", - "is-glob": "^2.0.0" - }, - "dependencies": { - "glob-parent": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-2.0.0.tgz", - "integrity": "sha1-gTg9ctsFT8zPUzbaqQLxgvbtuyg=", - "requires": { - "is-glob": "^2.0.0" - } - }, - "is-extglob": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", - "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=" - }, - "is-glob": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", - "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", - "requires": { - "is-extglob": "^1.0.0" - } - } - } - }, "glob-parent": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", @@ -10110,19 +9447,6 @@ "resolved": "https://registry.npmjs.org/is-directory/-/is-directory-0.3.1.tgz", "integrity": "sha1-YTObbyR1/Hcv2cnYP1yFddwVSuE=" }, - "is-dotfile": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/is-dotfile/-/is-dotfile-1.0.3.tgz", - "integrity": "sha1-pqLzL/0t+wT1yiXs0Pa4PPeYoeE=" - }, - "is-equal-shallow": { - "version": "0.1.3", - "resolved": "https://registry.npmjs.org/is-equal-shallow/-/is-equal-shallow-0.1.3.tgz", - "integrity": "sha1-IjgJj8Ih3gvPpdnqxMRdY4qhxTQ=", - "requires": { - "is-primitive": "^2.0.0" - } - }, "is-extendable": { "version": "0.1.1", "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz", @@ -10263,16 +9587,6 @@ "resolved": "https://registry.npmjs.org/is-png/-/is-png-1.1.0.tgz", "integrity": "sha1-1XSxK/J1wDUEVVcLDltXqwYgd84=" }, - "is-posix-bracket": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/is-posix-bracket/-/is-posix-bracket-0.1.1.tgz", - "integrity": "sha1-MzTceXdDaOkvAW5vvAqI9c1ua8Q=" - }, - "is-primitive": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/is-primitive/-/is-primitive-2.0.0.tgz", - "integrity": "sha1-IHurkWOEmcB7Kt8kCkGochADRXU=" - }, "is-promise": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/is-promise/-/is-promise-2.1.0.tgz", @@ -11162,11 +10476,6 @@ "resolved": "https://registry.npmjs.org/marked/-/marked-0.4.0.tgz", "integrity": "sha512-tMsdNBgOsrUophCAFQl0XPe6Zqk/uy9gnue+jIIKhykO51hxyu6uNx7zBPy0+y/WKYVZZMspV9YeXLNdKk+iYw==" }, - "math-random": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/math-random/-/math-random-1.0.1.tgz", - "integrity": "sha1-izqsWIuKZuSXXjzepn97sylgH6w=" - }, "md-attr-parser": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/md-attr-parser/-/md-attr-parser-1.2.1.tgz", @@ -12230,15 +11539,6 @@ "es-abstract": "^1.5.1" } }, - "object.omit": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/object.omit/-/object.omit-2.0.1.tgz", - "integrity": "sha1-Gpx0SCnznbuFjHbKNXmuKlTr0fo=", - "requires": { - "for-own": "^0.1.4", - "is-extendable": "^0.1.1" - } - }, "object.pick": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/object.pick/-/object.pick-1.3.0.tgz", @@ -12579,32 +11879,6 @@ "path-root": "^0.1.1" } 
}, - "parse-glob": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/parse-glob/-/parse-glob-3.0.4.tgz", - "integrity": "sha1-ssN2z7EfNVE7rdFz7wu246OIORw=", - "requires": { - "glob-base": "^0.3.0", - "is-dotfile": "^1.0.0", - "is-extglob": "^1.0.0", - "is-glob": "^2.0.0" - }, - "dependencies": { - "is-extglob": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", - "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=" - }, - "is-glob": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", - "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", - "requires": { - "is-extglob": "^1.0.0" - } - } - } - }, "parse-headers": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/parse-headers/-/parse-headers-2.0.1.tgz", @@ -14769,11 +14043,6 @@ "resolved": "https://registry.npmjs.org/prepend-http/-/prepend-http-1.0.4.tgz", "integrity": "sha1-1PRWKwzjaW5BrFLQ4ALlemNdxtw=" }, - "preserve": { - "version": "0.2.0", - "resolved": "https://registry.npmjs.org/preserve/-/preserve-0.2.0.tgz", - "integrity": "sha1-gV7R9uvGWSb4ZbMQwHE7yzMVzks=" - }, "prettier": { "version": "1.16.4", "resolved": "https://registry.npmjs.org/prettier/-/prettier-1.16.4.tgz", @@ -14982,23 +14251,6 @@ "resolved": "http://registry.npmjs.org/ramda/-/ramda-0.21.0.tgz", "integrity": "sha1-oAGr7bP/YQd9T/HVd9RN536NCjU=" }, - "randomatic": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/randomatic/-/randomatic-3.1.1.tgz", - "integrity": "sha512-TuDE5KxZ0J461RVjrJZCJc+J+zCkTb1MbH9AQUq68sMhOMcy9jLcb3BrZKgp9q9Ncltdg4QVqWrH02W2EFFVYw==", - "requires": { - "is-number": "^4.0.0", - "kind-of": "^6.0.0", - "math-random": "^1.0.1" - }, - "dependencies": { - "is-number": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-4.0.0.tgz", - "integrity": "sha512-rSklcAIlf1OmFdyAqbnWTLVelsQ58uvZ66S/ZyawjWqIviTWCjg2PzVGw8WUA+nNuPTqb4wgA+NszrJ+08LlgQ==" - } - } - }, "randombytes": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", @@ -15458,14 +14710,6 @@ "private": "^0.1.6" } }, - "regex-cache": { - "version": "0.4.4", - "resolved": "https://registry.npmjs.org/regex-cache/-/regex-cache-0.4.4.tgz", - "integrity": "sha512-nVIZwtCjkC9YgvWkpM55B5rBhBYRZhAaJbgcFYXXsHnbZ9UZI9nnVWYZpBlCqv9ho2eZryPnWrZGsOdPwVWXWQ==", - "requires": { - "is-equal-shallow": "^0.1.3" - } - }, "regex-not": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/regex-not/-/regex-not-1.0.2.tgz", @@ -17710,9 +16954,9 @@ }, "dependencies": { "ajv": { - "version": "6.9.2", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.9.2.tgz", - "integrity": "sha512-4UFy0/LgDo7Oa/+wOAlj44tp9K78u38E5/359eSrqEp1Z5PdVfimCcs7SluXMP755RUQu6d2b4AvF0R1C9RZjg==", + "version": "6.10.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.10.0.tgz", + "integrity": "sha512-nffhOpkymDECQyR0mnsUtoCE8RlX38G0rYP+wgLWFyZuUyuuojSSvi/+euOiQBIn63whYwYVIIH1TvE3tu4OEg==", "requires": { "fast-deep-equal": "^2.0.1", "fast-json-stable-stringify": "^2.0.0", @@ -17721,26 +16965,26 @@ } }, "ansi-regex": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-4.0.0.tgz", - "integrity": "sha512-iB5Dda8t/UqpPI/IjsejXu5jOGDrzn41wJyljwPH65VCIbk6+1BzFIMJGFwTNrYXT1CrD+B4l19U7awiQ8rk7w==" + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-4.1.0.tgz", + "integrity": 
"sha512-1apePfXM1UOSqw0o9IiFAovVz9M5S1Dg+4TrDwfMewQ6p/rmMueb7tWZjQ1rx4Loy1ArBggoqGpfqqdI4rondg==" }, "string-width": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-3.0.0.tgz", - "integrity": "sha512-rr8CUxBbvOZDUvc5lNIJ+OC1nPVpz+Siw9VBtUjB9b6jZehZLFt0JMCZzShFHIsI8cbhm0EsNIfWJMFV3cu3Ew==", + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-3.1.0.tgz", + "integrity": "sha512-vafcv6KjVZKSgz06oM/H6GDBrAtz8vdhQakGjFIvNrHA6y3HCF1CInLy+QLq8dTJPQ1b+KDUqDFctkdRW44e1w==", "requires": { "emoji-regex": "^7.0.1", "is-fullwidth-code-point": "^2.0.0", - "strip-ansi": "^5.0.0" + "strip-ansi": "^5.1.0" } }, "strip-ansi": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-5.0.0.tgz", - "integrity": "sha512-Uu7gQyZI7J7gn5qLn1Np3G9vcYGTVqB+lFTytnDJv83dd8T22aGH451P3jueT2/QemInJDfxHB5Tde5OzgG1Ow==", + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-5.1.0.tgz", + "integrity": "sha512-TjxrkPONqO2Z8QDCpeE2j6n0M6EwxzyDgzEeGp+FbdvaJAt//ClYi6W5my+3ROlC/hZX2KACUwDfK49Ka5eDvg==", "requires": { - "ansi-regex": "^4.0.0" + "ansi-regex": "^4.1.0" } } } diff --git a/website/package.json b/website/package.json index 724eff194..916ac4d6d 100644 --- a/website/package.json +++ b/website/package.json @@ -35,7 +35,7 @@ "gatsby-remark-prismjs": "^3.2.4", "gatsby-remark-smartypants": "^2.0.8", "gatsby-remark-unwrap-images": "^1.0.1", - "gatsby-source-filesystem": "^2.0.20", + "gatsby-source-filesystem": "^2.0.24", "gatsby-transformer-remark": "^2.2.5", "gatsby-transformer-sharp": "^2.1.13", "html-to-react": "^1.3.4", diff --git a/website/src/components/accordion.js b/website/src/components/accordion.js index 7653c93ed..75485e698 100644 --- a/website/src/components/accordion.js +++ b/website/src/components/accordion.js @@ -7,7 +7,7 @@ import Link from './link' import classes from '../styles/accordion.module.sass' const Accordion = ({ title, id, expanded, children }) => { - const anchorId = id ? id : slugify(title) + const anchorId = id || slugify(title) const [isExpanded, setIsExpanded] = useState(expanded) const contentClassNames = classNames(classes.content, { [classes.hidden]: !isExpanded, diff --git a/website/src/components/github.js b/website/src/components/github.js index 73555da65..bc857f839 100644 --- a/website/src/components/github.js +++ b/website/src/components/github.js @@ -33,10 +33,11 @@ const GitHubCode = ({ url, lang, errorMsg, className }) => { }) .catch(err => { setCode(errorMsg) + console.error(err) }) setInitialized(true) } - }, []) + }, [initialized, rawUrl, errorMsg]) const highlighted = lang === 'none' || !code ? 
code : highlightCode(lang, code) diff --git a/website/src/components/progress.js b/website/src/components/progress.js index 0a71f1cf5..baf1d09d0 100644 --- a/website/src/components/progress.js +++ b/website/src/components/progress.js @@ -34,22 +34,19 @@ const Progress = () => { setOffset(getOffset()) } - useEffect( - () => { - if (!initialized && progressRef.current) { - handleResize() - setInitialized(true) - } - window.addEventListener('scroll', handleScroll) - window.addEventListener('resize', handleResize) + useEffect(() => { + if (!initialized && progressRef.current) { + handleResize() + setInitialized(true) + } + window.addEventListener('scroll', handleScroll) + window.addEventListener('resize', handleResize) - return () => { - window.removeEventListener('scroll', handleScroll) - window.removeEventListener('resize', handleResize) - } - }, - [progressRef] - ) + return () => { + window.removeEventListener('scroll', handleScroll) + window.removeEventListener('resize', handleResize) + } + }, [initialized, progressRef]) const { height, vh } = offset const total = 100 - ((height - scrollY - vh) / height) * 100 diff --git a/website/src/components/quickstart.js b/website/src/components/quickstart.js index eae3a1a1e..c3a5b3f8c 100644 --- a/website/src/components/quickstart.js +++ b/website/src/components/quickstart.js @@ -8,6 +8,12 @@ import Icon from './icon' import { H2 } from './typography' import classes from '../styles/quickstart.module.sass' +function getNewChecked(optionId, checkedForId, multiple) { + if (!multiple) return [optionId] + if (checkedForId.includes(optionId)) return checkedForId.filter(opt => opt !== optionId) + return [...checkedForId, optionId] +} + const Quickstart = ({ data, title, description, id, children }) => { const [styles, setStyles] = useState({}) const [checked, setChecked] = useState({}) @@ -38,7 +44,7 @@ const Quickstart = ({ data, title, description, id, children }) => { setStyles(initialStyles) setInitialized(true) } - }) + }, [data, initialized]) return !data.length ? null : (
@@ -76,13 +82,11 @@ const Quickstart = ({ data, title, description, id, children }) => { onChange={() => { const newChecked = { ...checked, - [id]: !multiple - ? [option.id] - : checkedForId.includes(option.id) - ? checkedForId.filter( - opt => opt !== option.id - ) - : [...checkedForId, option.id], + [id]: getNewChecked( + option.id, + checkedForId, + multiple + ), } setChecked(newChecked) setStyles({ diff --git a/website/src/components/search.js b/website/src/components/search.js index 8e68143a1..9d63c06b5 100644 --- a/website/src/components/search.js +++ b/website/src/components/search.js @@ -7,10 +7,10 @@ import classes from '../styles/search.module.sass' const Search = ({ id, placeholder, settings }) => { const { apiKey, indexName } = settings - const [isInitialized, setIsInitialized] = useState(false) + const [initialized, setInitialized] = useState(false) useEffect(() => { - if (!isInitialized) { - setIsInitialized(true) + if (!initialized) { + setInitialized(true) window.docsearch({ apiKey, indexName, @@ -18,7 +18,7 @@ const Search = ({ id, placeholder, settings }) => { debug: false, }) } - }, window.docsearch) + }, [initialized, apiKey, indexName, id]) return (