From 98799d849e2a78f54797178e45fbe37c5161943d Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 26 Jul 2023 13:56:31 +0200 Subject: [PATCH 1/5] `SpanCat`: Remove invalid `threshold` config argument (#12860) --- website/docs/api/spancategorizer.mdx | 1 - 1 file changed, 1 deletion(-) diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx index 81a473ac2..2b63d31ce 100644 --- a/website/docs/api/spancategorizer.mdx +++ b/website/docs/api/spancategorizer.mdx @@ -67,7 +67,6 @@ architectures and their arguments and hyperparameters. > ```python > from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL > config = { -> "threshold": 0.5, > "spans_key": "labeled_spans", > "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL, > "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, From 51b9655470aca59df3adacc4b05c77cde6e5579b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 26 Jul 2023 16:05:53 +0200 Subject: [PATCH 2/5] Added OdyCy to spaCy Universe (#12826) * Added OdyCy to spaCy Universe * Replaced template tags Co-authored-by: Adriane Boyd --------- Co-authored-by: Adriane Boyd --- website/meta/universe.json | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 75ec5fb5c..041ebbff8 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -67,6 +67,33 @@ "category": ["pipeline", "research"], "tags": ["latin"] }, + { + "id": "odycy", + "title": "OdyCy", + "slogan": "General-purpose language pipelines for premodern Greek.", + "description": "Academically validated modular NLP pipelines for premodern Greek. odyCy achieves state of the art performance on multiple tasks on unseen test data from the Universal Dependencies Perseus treebank, and performs second best on the PROIEL treebank’s test set on even more tasks. 
In addition, performance also seems relatively stable across the two evaluation datasets in comparison with other NLP pipelines. OdyCy is being used at the Center for Humanities Computing for preprocessing and analyzing Ancient Greek corpora for New Testament research, meaning that you can expect consistent maintenance and improvements.", + "github": "centre-for-humanities-computing/odyCy", + "code_example": [ + "# To install the high-accuracy transformer-based pipeline", + "# pip install https://huggingface.co/chcaa/grc_odycy_joint_trf/resolve/main/grc_odycy_joint_trf-any-py3-none-any.whl", + "import spacy", + "", + "nlp = spacy.load('grc_odycy_joint_trf')", + "", + "doc = nlp('τὴν γοῦν Ἀττικὴν ἐκ τοῦ ἐπὶ πλεῖστον διὰ τὸ λεπτόγεων ἀστασίαστον οὖσαν ἄνθρωποι ᾤκουν οἱ αὐτοὶ αἰεί.')" + ], + "code_language": "python", + "url": "https://centre-for-humanities-computing.github.io/odyCy/", + "thumb": "https://raw.githubusercontent.com/centre-for-humanities-computing/odyCy/7b94fec60679d06272dca88a4dcfe0f329779aea/docs/_static/logo.svg", + "image": "https://github.com/centre-for-humanities-computing/odyCy/raw/main/docs/_static/logo_with_text_below.svg", + "author": "Jan Kostkan, Márton Kardos (Center for Humanities Computing, Aarhus University)", + "author_links": { + "github": "centre-for-humanities-computing", + "website": "https://chc.au.dk/" + }, + "category": ["pipeline", "standalone", "research"], + "tags": ["ancient Greek"] + }, { "id": "spacy-wasm", "title": "spacy-wasm", From 49055ed7c825bc5c6ce828ddf0ff0bcbf1527a7b Mon Sep 17 00:00:00 2001 From: Victoria <80417010+victorialslocum@users.noreply.github.com> Date: Mon, 31 Jul 2023 09:39:00 +0200 Subject: [PATCH 3/5] Add cli for finding locations of registered func (#12757) * Add cli for finding locations of registered func * fixes: naming and typing * isort * update naming * remove to find-function * remove file:// bit * use registry name if given and exit gracefully if a registry was not found * clean up failure msg 
* specify registry_name options * mypy fixes * return location for internal usage * add documentation * more mypy fixes * clean up example * add section to menu * add tests --------- Co-authored-by: svlandeg --- spacy/cli/__init__.py | 1 + spacy/cli/find_function.py | 69 +++++++++++++++++++++++++++++++++++++ spacy/tests/test_cli_app.py | 34 ++++++++++++++++++ website/docs/api/cli.mdx | 36 +++++++++++++++---- 4 files changed, 133 insertions(+), 7 deletions(-) create mode 100644 spacy/cli/find_function.py diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 549a27616..60fe718c7 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -14,6 +14,7 @@ from .debug_diff import debug_diff # noqa: F401 from .debug_model import debug_model # noqa: F401 from .download import download # noqa: F401 from .evaluate import evaluate # noqa: F401 +from .find_function import find_function # noqa: F401 from .find_threshold import find_threshold # noqa: F401 from .info import info # noqa: F401 from .init_config import fill_config, init_config # noqa: F401 diff --git a/spacy/cli/find_function.py b/spacy/cli/find_function.py new file mode 100644 index 000000000..f99ce2adc --- /dev/null +++ b/spacy/cli/find_function.py @@ -0,0 +1,69 @@ +from typing import Optional, Tuple + +from catalogue import RegistryError +from wasabi import msg + +from ..util import registry +from ._util import Arg, Opt, app + + +@app.command("find-function") +def find_function_cli( + # fmt: off + func_name: str = Arg(..., help="Name of the registered function."), + registry_name: Optional[str] = Opt(None, "--registry", "-r", help="Name of the catalogue registry."), + # fmt: on +): + """ + Find the module, path and line number to the file the registered + function is defined in, if available. + + func_name (str): Name of the registered function. + registry_name (Optional[str]): Name of the catalogue registry. 
+ + DOCS: https://spacy.io/api/cli#find-function + """ + if not registry_name: + registry_names = registry.get_registry_names() + for name in registry_names: + if registry.has(name, func_name): + registry_name = name + break + + if not registry_name: + msg.fail( + f"Couldn't find registered function: '{func_name}'", + exits=1, + ) + + assert registry_name is not None + find_function(func_name, registry_name) + + +def find_function(func_name: str, registry_name: str) -> Tuple[str, int]: + registry_desc = None + try: + registry_desc = registry.find(registry_name, func_name) + except RegistryError as e: + msg.fail( + f"Couldn't find registered function: '{func_name}' in registry '{registry_name}'", + ) + msg.fail(f"{e}", exits=1) + assert registry_desc is not None + + registry_path = None + line_no = None + if registry_desc["file"]: + registry_path = registry_desc["file"] + line_no = registry_desc["line_no"] + + if not registry_path or not line_no: + msg.fail( + f"Couldn't find path to registered function: '{func_name}' in registry '{registry_name}'", + exits=1, + ) + assert registry_path is not None + assert line_no is not None + + msg.good(f"Found registered function '{func_name}' at {registry_path}:{line_no}") + return str(registry_path), int(line_no) diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 3a426113b..0e6d8e252 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -233,3 +233,37 @@ def test_project_push_pull(project_dir): result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)]) assert result.exit_code == 0 assert test_file.is_file() + + +def test_find_function_valid(): + # example of architecture in main code base + function = "spacy.TextCatBOW.v2" + result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"]) + assert f"Found registered function '{function}'" in result.stdout + assert "textcat.py" in result.stdout + + result = CliRunner().invoke(app, 
["find-function", function]) + assert f"Found registered function '{function}'" in result.stdout + assert "textcat.py" in result.stdout + + # example of architecture in spacy-legacy + function = "spacy.TextCatBOW.v1" + result = CliRunner().invoke(app, ["find-function", function]) + assert f"Found registered function '{function}'" in result.stdout + assert "spacy_legacy" in result.stdout + assert "textcat.py" in result.stdout + + +def test_find_function_invalid(): + # invalid registry + function = "spacy.TextCatBOW.v2" + registry = "foobar" + result = CliRunner().invoke( + app, ["find-function", function, "--registry", registry] + ) + assert f"Unknown function registry: '{registry}'" in result.stdout + + # invalid function + function = "spacy.TextCatBOW.v666" + result = CliRunner().invoke(app, ["find-function", function]) + assert f"Couldn't find registered function: '{function}'" in result.stdout diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 6a87f78b8..d63ac6e1d 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -7,6 +7,7 @@ menu: - ['info', 'info'] - ['validate', 'validate'] - ['init', 'init'] + - ['find-function', 'find-function'] - ['convert', 'convert'] - ['debug', 'debug'] - ['train', 'train'] @@ -251,6 +252,27 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [ | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | **CREATES** | The label files. | +## find-function {id="find-function",version="3.7",tag="command"} + +Find the module, path and line number to the file for a given registered +function. This functionality is helpful to understand where registered +functions, as used in the config file, are defined. 
+ +```bash +$ python -m spacy find-function [func_name] [--registry] +``` + +> #### Example +> +> ```bash +> $ python -m spacy find-function spacy.TextCatBOW.v1 +> ``` + +| Name | Description | +| ------------------ | ----------------------------------------------------- | +| `func_name` | Name of the registered function. ~~str (positional)~~ | +| `--registry`, `-r` | Name of the catalogue registry. ~~str (option)~~ | + ## convert {id="convert",tag="command"} Convert files into spaCy's @@ -1651,10 +1673,10 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose] > $ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl > ``` -| Name | Description | -| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ | -| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ | -| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ | -| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ | -| **UPLOADS** | The pipeline to the hub. | +| Name | Description | +| ----------------- | ------------------------------------------------------------------------------------------------------------------- | +| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ | +| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ | +| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ | +| `--verbose`, `-V` | Output additional info for debugging, e.g. 
the full generated hub metadata. ~~bool (flag)~~ | +| **UPLOADS** | The pipeline to the hub. | From c9e9dccf7951bd474c5be8ca46dad7290ae86ee2 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 31 Jul 2023 10:47:57 +0200 Subject: [PATCH 4/5] Add displaCy data structures to docs (2) (#12875) * Add data structures to docs * Adjusted descriptions for more consistency * Add _optional_ flag to parameters * Add tests and adjust optional title key in doc * Add title to dep visualizations * fix typo --------- Co-authored-by: thomashacker --- spacy/displacy/render.py | 2 + spacy/tests/test_displacy.py | 72 +++++++++++++++++ website/docs/api/top-level.mdx | 124 +++++++++++++++++++++++++++++ website/docs/usage/visualizers.mdx | 3 +- 4 files changed, 200 insertions(+), 1 deletion(-) diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 47407bcb7..758dc07d5 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -313,6 +313,8 @@ class DependencyRenderer: self.lang = settings.get("lang", DEFAULT_LANG) render_id = f"{id_prefix}-{i}" svg = self.render_svg(render_id, p["words"], p["arcs"]) + if p.get("title"): + svg = TPL_TITLE.format(title=p.get("title")) + svg rendered.append(svg) if page: content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered]) diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 1570f8d09..e9b5a9aba 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -350,6 +350,78 @@ def test_displacy_render_wrapper(en_vocab): displacy.set_render_wrapper(lambda html: html) +def test_displacy_render_manual_dep(): + """Test displacy.render with manual data for dep style""" + parsed_dep = { + "words": [ + {"text": "This", "tag": "DT"}, + {"text": "is", "tag": "VBZ"}, + {"text": "a", "tag": "DT"}, + {"text": "sentence", "tag": "NN"}, + ], + "arcs": [ + {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, + {"start": 2, "end": 3, "label": "det", "dir": "left"}, + 
{"start": 1, "end": 3, "label": "attr", "dir": "right"}, + ], + "title": "Title", + } + html = displacy.render([parsed_dep], style="dep", manual=True) + for word in parsed_dep["words"]: + assert word["text"] in html + assert word["tag"] in html + + +def test_displacy_render_manual_ent(): + """Test displacy.render with manual data for ent style""" + parsed_ents = [ + { + "text": "But Google is starting from behind.", + "ents": [{"start": 4, "end": 10, "label": "ORG"}], + }, + { + "text": "But Google is starting from behind.", + "ents": [{"start": -100, "end": 100, "label": "COMPANY"}], + "title": "Title", + }, + ] + + html = displacy.render(parsed_ents, style="ent", manual=True) + for parsed_ent in parsed_ents: + assert parsed_ent["ents"][0]["label"] in html + if "title" in parsed_ent: + assert parsed_ent["title"] in html + + +def test_displacy_render_manual_span(): + """Test displacy.render with manual data for span style""" + parsed_spans = [ + { + "text": "Welcome to the Bank of China.", + "spans": [ + {"start_token": 3, "end_token": 6, "label": "ORG"}, + {"start_token": 5, "end_token": 6, "label": "GPE"}, + ], + "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."], + }, + { + "text": "Welcome to the Bank of China.", + "spans": [ + {"start_token": 3, "end_token": 6, "label": "ORG"}, + {"start_token": 5, "end_token": 6, "label": "GPE"}, + ], + "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."], + "title": "Title", + }, + ] + + html = displacy.render(parsed_spans, style="span", manual=True) + for parsed_span in parsed_spans: + assert parsed_span["spans"][0]["label"] in html + if "title" in parsed_span: + assert parsed_span["title"] in html + + def test_displacy_options_case(): ents = ["foo", "BAR"] colors = {"FOO": "red", "bar": "green"} diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index 64ec342cd..37e86a4bc 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -343,6 +343,130 @@ 
use with the `manual=True` argument in `displacy.render`. | `options` | Span-specific visualisation options. ~~Dict[str, Any]~~ | | **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ | +### Visualizer data structures {id="displacy_structures"} + +You can use displaCy's data format to manually render data. This can be useful +if you want to visualize output from other libraries. You can find examples of +displaCy's different data formats below. + +> #### DEP example data structure +> +> ```json +> { +> "words": [ +> { "text": "This", "tag": "DT" }, +> { "text": "is", "tag": "VBZ" }, +> { "text": "a", "tag": "DT" }, +> { "text": "sentence", "tag": "NN" } +> ], +> "arcs": [ +> { "start": 0, "end": 1, "label": "nsubj", "dir": "left" }, +> { "start": 2, "end": 3, "label": "det", "dir": "left" }, +> { "start": 1, "end": 3, "label": "attr", "dir": "right" } +> ] +> } +> ``` + +#### Dependency Visualizer data structure {id="structure-dep"} + +| Dictionary Key | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------- | +| `words` | List of dictionaries describing a word token (see structure below). ~~List[Dict[str, Any]]~~ | +| `arcs` | List of dictionaries describing the relations between words (see structure below). ~~List[Dict[str, Any]]~~ | +| _Optional_ | | +| `title` | Title of the visualization. ~~Optional[str]~~ | +| `settings` | Dependency Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ | + + + +| Dictionary Key | Description | +| -------------- | ---------------------------------------- | +| `text` | Text content of the word. ~~str~~ | +| `tag` | Fine-grained part-of-speech. ~~str~~ | +| `lemma` | Base form of the word. ~~Optional[str]~~ | + + + + + +| Dictionary Key | Description | +| -------------- | ---------------------------------------------------- | +| `start` | The index of the starting token. 
~~int~~ | +| `end` | The index of the ending token. ~~int~~ | +| `label` | The type of dependency relation. ~~str~~ | +| `dir` | Direction of the relation (`left`, `right`). ~~str~~ | + + + +> #### ENT example data structure +> +> ```json +> { +> "text": "But Google is starting from behind.", +> "ents": [{ "start": 4, "end": 10, "label": "ORG" }] +> } +> ``` + +#### Named Entity Recognition data structure {id="structure-ent"} + +| Dictionary Key | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `text` | String representation of the document text. ~~str~~ | +| `ents` | List of dictionaries describing entities (see structure below). ~~List[Dict[str, Any]]~~ | +| _Optional_ | | +| `title` | Title of the visualization. ~~Optional[str]~~ | +| `settings` | Entity Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ | + + + +| Dictionary Key | Description | +| -------------- | ---------------------------------------------------------------------- | +| `start` | The index of the first character of the entity. ~~int~~ | +| `end` | The index one past the last character of the entity (exclusive). ~~int~~ | +| `label` | Label attached to the entity. ~~str~~ | +| _Optional_ | | +| `kb_id` | `KnowledgeBase` ID. ~~str~~ | +| `kb_url` | `KnowledgeBase` URL. ~~str~~ | + + + +> #### SPAN example data structure +> +> ```json +> { +> "text": "Welcome to the Bank of China.", +> "spans": [ +> { "start_token": 3, "end_token": 6, "label": "ORG" }, +> { "start_token": 5, "end_token": 6, "label": "GPE" } +> ], +> "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."] +> } +> ``` + +#### Span Classification data structure {id="structure-span"} + +| Dictionary Key | Description | +| -------------- | ----------------------------------------------------------------------------------------- | +| `text` | String representation of the document text. 
~~str~~ | +| `spans` | List of dictionaries describing spans (see structure below). ~~List[Dict[str, Any]]~~ | +| `tokens` | List of word tokens. ~~List[str]~~ | +| _Optional_ | | +| `title` | Title of the visualization. ~~Optional[str]~~ | +| `settings` | Span Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ | + + + +| Dictionary Key | Description | +| -------------- | ------------------------------------------------------------- | +| `start_token` | The index of the first token of the span in `tokens`. ~~int~~ | +| `end_token` | The index one past the last token of the span in `tokens` (exclusive). ~~int~~ | +| `label` | Label attached to the span. ~~str~~ | +| _Optional_ | | +| `kb_id` | `KnowledgeBase` ID. ~~str~~ | +| `kb_url` | `KnowledgeBase` URL. ~~str~~ | + + + ### Visualizer options {id="displacy_options"} The `options` argument lets you specify additional settings for each visualizer. diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx index 1ac931753..e73c4a16a 100644 --- a/website/docs/usage/visualizers.mdx +++ b/website/docs/usage/visualizers.mdx @@ -349,7 +349,8 @@ or [SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet). If you set `manual=True` on either `render()` or `serve()`, you can pass in data in displaCy's format as a dictionary (instead of `Doc` objects). There are -helper functions for converting `Doc` objects to displaCy's format for use with +helper functions for converting `Doc` objects to +[displaCy's format](/api/top-level#displacy_structures) for use with `manual=True`: [`displacy.parse_deps`](/api/top-level#displacy.parse_deps), [`displacy.parse_ents`](/api/top-level#displacy.parse_ents), and [`displacy.parse_spans`](/api/top-level#displacy.parse_spans). 
From 186889ec9c4c7a5b6b9b88bea0c6c74a763998ee Mon Sep 17 00:00:00 2001 From: Andy Friedman Date: Mon, 31 Jul 2023 04:52:32 -0400 Subject: [PATCH 5/5] added entry for SaysWho (#12828) * Update universe.json added entry for Sayswho * Update universe.json updated sayswho entry * Update universe.json * Update website/meta/universe.json * Update website/meta/universe.json --------- Co-authored-by: Adriane Boyd --- website/meta/universe.json | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 041ebbff8..2ed8b4b41 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -17,6 +17,31 @@ "category": ["extension"], "tags": [] }, + { + "id": "sayswho", + "title": "SaysWho", + "slogan": "Quote identification, attribution and resolution", + "description": "A Python package for identifying and attributing quotes in text. It uses a combination of spaCy functionality, logic and grammar to find quotes and their speakers, then uses the spaCy coreferencing model to better clarify who is speaking. Currently English only.", + "github": "afriedman412/sayswho", + "pip": "sayswho", + "code_language": "python", + "author": "Andy Friedman", + "author_links": { + "twitter": "@steadynappin", + "github": "afriedman412" + }, + "code_example": [ + "from sayswho import SaysWho", + "text = open(\"path/to/your/text_file.txt\").read()", + "sw = SaysWho()", + "sw.attribute(text)", + + "sw.expand_match() # see quote/cluster matches", + "sw.render_to_html() # output your text, quotes and cluster matches to an html file called \"temp.html\"" + ], + "category": ["standalone"], + "tags": ["attribution", "coref", "text-processing"] + }, { "id": "parsigs", "title": "parsigs",