Merge branch 'upstream_master' into test-cli-app-init-config

2025-08-02 03:10:22 +03:00 · 2023-07-31 11:09:09 +02:00 · 2023-07-31 11:09:09 +02:00 · d67807d91f
commit d67807d91f
parent 2f88c4ef09 186889ec9c
10 changed files with 385 additions and 9 deletions
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@ -14,6 +14,7 @@ from .debug_diff import debug_diff  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .download import download  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
+from .find_function import find_function  # noqa: F401
 from .find_threshold import find_threshold  # noqa: F401
 from .info import info  # noqa: F401
 from .init_config import fill_config, init_config  # noqa: F401
--- a/spacy/cli/find_function.py
+++ b/spacy/cli/find_function.py
@ -0,0 +1,69 @@
+from typing import Optional, Tuple
+
+from catalogue import RegistryError
+from wasabi import msg
+
+from ..util import registry
+from ._util import Arg, Opt, app
+
+
+@app.command("find-function")
+def find_function_cli(
+    # fmt: off
+    func_name: str = Arg(..., help="Name of the registered function."),
+    registry_name: Optional[str] = Opt(None, "--registry", "-r", help="Name of the catalogue registry."),
+    # fmt: on
+):
+    """
+    Find the module, path and line number to the file the registered
+    function is defined in, if available.
+
+    func_name (str): Name of the registered function.
+    registry_name (Optional[str]): Name of the catalogue registry.
+
+    DOCS: https://spacy.io/api/cli#find-function
+    """
+    if not registry_name:
+        # No registry given: use the first registry that knows the function,
+        # keeping the current (empty) value when none of them does.
+        matching_registries = (
+            candidate
+            for candidate in registry.get_registry_names()
+            if registry.has(candidate, func_name)
+        )
+        registry_name = next(matching_registries, registry_name)
+
+    if not registry_name:
+        msg.fail(
+            f"Couldn't find registered function: '{func_name}'",
+            exits=1,
+        )
+
+    # Narrow Optional[str] to str before delegating to the lookup.
+    assert registry_name is not None
+    find_function(func_name, registry_name)
+
+
+def find_function(func_name: str, registry_name: str) -> Tuple[str, int]:
+    """Look up where a registered function is defined and report it.
+
+    func_name (str): Name of the registered function.
+    registry_name (str): Name of the registry to search in.
+    RETURNS (Tuple[str, int]): The file path and line number taken from the
+        registry's description of the function.
+
+    Failures (lookup error, or no file / line number in the description) are
+    reported through `msg.fail` with `exits=1`.
+    """
+    description = None
+    try:
+        description = registry.find(registry_name, func_name)
+    except RegistryError as err:
+        # Print our own summary first, then the registry's message.
+        msg.fail(
+            f"Couldn't find registered function: '{func_name}' in registry '{registry_name}'",
+        )
+        msg.fail(f"{err}", exits=1)
+    assert description is not None
+
+    # The line number is only read when the description names a file.
+    source_file = description["file"]
+    registry_path = source_file if source_file else None
+    line_no = description["line_no"] if source_file else None
+
+    if not (registry_path and line_no):
+        msg.fail(
+            f"Couldn't find path to registered function: '{func_name}' in registry '{registry_name}'",
+            exits=1,
+        )
+    assert registry_path is not None
+    assert line_no is not None
+
+    msg.good(f"Found registered function '{func_name}' at {registry_path}:{line_no}")
+    return str(registry_path), int(line_no)
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@ -313,6 +313,8 @@ class DependencyRenderer:
                self.lang = settings.get("lang", DEFAULT_LANG)
            render_id = f"{id_prefix}-{i}"
            svg = self.render_svg(render_id, p["words"], p["arcs"])
+            if p.get("title"):
+                svg = TPL_TITLE.format(title=p.get("title")) + svg
            rendered.append(svg)
        if page:
            content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered])
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@ -235,6 +235,40 @@ def test_project_push_pull(project_dir):
        assert test_file.is_file()


+def test_find_function_valid():
+    """Check that `find-function` locates registered functions.
+
+    Covers a lookup with an explicit registry, a lookup without one, and a
+    function expected to live in spacy-legacy. Each invocation must exit
+    with status 0, so a crash after the success message is not missed.
+    """
+    # example of architecture in main code base
+    function = "spacy.TextCatBOW.v2"
+    result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
+    assert result.exit_code == 0
+    assert f"Found registered function '{function}'" in result.stdout
+    assert "textcat.py" in result.stdout
+
+    result = CliRunner().invoke(app, ["find-function", function])
+    assert result.exit_code == 0
+    assert f"Found registered function '{function}'" in result.stdout
+    assert "textcat.py" in result.stdout
+
+    # example of architecture in spacy-legacy
+    function = "spacy.TextCatBOW.v1"
+    result = CliRunner().invoke(app, ["find-function", function])
+    assert result.exit_code == 0
+    assert f"Found registered function '{function}'" in result.stdout
+    assert "spacy_legacy" in result.stdout
+    assert "textcat.py" in result.stdout
+
+
+def test_find_function_invalid():
+    """Check that `find-function` fails cleanly on bad input.
+
+    Both an unknown registry and an unknown function name must print the
+    expected message and exit with status 1 (the command reports these
+    failures with `exits=1`).
+    """
+    # invalid registry
+    function = "spacy.TextCatBOW.v2"
+    registry = "foobar"
+    result = CliRunner().invoke(
+        app, ["find-function", function, "--registry", registry]
+    )
+    assert result.exit_code == 1
+    assert f"Unknown function registry: '{registry}'" in result.stdout
+
+    # invalid function
+    function = "spacy.TextCatBOW.v666"
+    result = CliRunner().invoke(app, ["find-function", function])
+    assert result.exit_code == 1
+    assert f"Couldn't find registered function: '{function}'" in result.stdout
+
+
 example_words_1 = ["I", "like", "cats"]
 example_words_2 = ["I", "like", "dogs"]
 example_lemmas_1 = ["I", "like", "cat"]
--- a/spacy/tests/test_displacy.py
+++ b/spacy/tests/test_displacy.py
@ -350,6 +350,78 @@ def test_displacy_render_wrapper(en_vocab):
    displacy.set_render_wrapper(lambda html: html)


+def test_displacy_render_manual_dep():
+    """Test displacy.render with manual data for dep style.
+
+    Every word's text and tag, as well as the optional title, must show up
+    in the rendered markup.
+    """
+    parsed_dep = {
+        "words": [
+            {"text": "This", "tag": "DT"},
+            {"text": "is", "tag": "VBZ"},
+            {"text": "a", "tag": "DT"},
+            {"text": "sentence", "tag": "NN"},
+        ],
+        "arcs": [
+            {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
+            {"start": 2, "end": 3, "label": "det", "dir": "left"},
+            {"start": 1, "end": 3, "label": "attr", "dir": "right"},
+        ],
+        "title": "Title",
+    }
+    html = displacy.render([parsed_dep], style="dep", manual=True)
+    for word in parsed_dep["words"]:
+        assert word["text"] in html
+        assert word["tag"] in html
+    # The dependency renderer prepends the title when one is provided, so it
+    # has to be part of the output just like in the ent and span tests.
+    assert parsed_dep["title"] in html
+
+
+def test_displacy_render_manual_ent():
+    """Test displacy.render with manual data for ent style.
+
+    Renders two manually specified documents and checks that the label of
+    each document's first entity, and the title where one is given, appear
+    in the generated markup.
+    """
+    parsed_ents = [
+        {
+            "text": "But Google is starting from behind.",
+            "ents": [{"start": 4, "end": 10, "label": "ORG"}],
+        },
+        {
+            "text": "But Google is starting from behind.",
+            # These offsets lie outside the text on both sides; presumably
+            # this checks that out-of-range entities do not break rendering.
+            "ents": [{"start": -100, "end": 100, "label": "COMPANY"}],
+            "title": "Title",
+        },
+    ]
+
+    html = displacy.render(parsed_ents, style="ent", manual=True)
+    for parsed_ent in parsed_ents:
+        assert parsed_ent["ents"][0]["label"] in html
+        # Only the second document defines a title.
+        if "title" in parsed_ent:
+            assert parsed_ent["title"] in html
+
+
+def test_displacy_render_manual_span():
+    """Test displacy.render with manual data for span style.
+
+    Renders two manually specified documents and checks that the label of
+    each document's first span, and the title where one is given, appear in
+    the generated markup.
+    """
+    parsed_spans = [
+        {
+            "text": "Welcome to the Bank of China.",
+            "spans": [
+                {"start_token": 3, "end_token": 6, "label": "ORG"},
+                {"start_token": 5, "end_token": 6, "label": "GPE"},
+            ],
+            "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
+        },
+        # Same document as above, with an additional title.
+        {
+            "text": "Welcome to the Bank of China.",
+            "spans": [
+                {"start_token": 3, "end_token": 6, "label": "ORG"},
+                {"start_token": 5, "end_token": 6, "label": "GPE"},
+            ],
+            "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
+            "title": "Title",
+        },
+    ]
+
+    html = displacy.render(parsed_spans, style="span", manual=True)
+    for parsed_span in parsed_spans:
+        assert parsed_span["spans"][0]["label"] in html
+        # Only the second document defines a title.
+        if "title" in parsed_span:
+            assert parsed_span["title"] in html
+
+
 def test_displacy_options_case():
    ents = ["foo", "BAR"]
    colors = {"FOO": "red", "bar": "green"}
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@ -7,6 +7,7 @@ menu:
  - ['info', 'info']
  - ['validate', 'validate']
  - ['init', 'init']
+  - ['find-function', 'find-function']
  - ['convert', 'convert']
  - ['debug', 'debug']
  - ['train', 'train']
@ -251,6 +252,27 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         |
 | **CREATES**       | The label files.                                                                                                                                                                                                   |

+## find-function {id="find-function",version="3.7",tag="command"}
+
+Find the module, path and line number of the file in which a given registered
+function is defined. This helps you locate the source code of the registered
+functions referenced in your config file.
+
+```bash
+$ python -m spacy find-function [func_name] [--registry]
+```
+
+> #### Example
+>
+> ```bash
+> $ python -m spacy find-function spacy.TextCatBOW.v1
+> ```
+
+| Name               | Description                                           |
+| ------------------ | ----------------------------------------------------- |
+| `func_name`        | Name of the registered function. ~~str (positional)~~ |
+| `--registry`, `-r` | Name of the catalogue registry. ~~str (option)~~      |
+
 ## convert {id="convert",tag="command"}

 Convert files into spaCy's
@ -1651,10 +1673,10 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose]
 > $ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl
 > ```

-| Name                 | Description                                                                                                                                     |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
-| `whl_path`           | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~                             |
-| `--org`, `-o`        | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~                                                        |
-| `--msg`, `-m`        | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~                                                       |
-| `--verbose`, `-V`    | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~                                                     |
-| **UPLOADS**          | The pipeline to the hub.                                                                                                                        |
+| Name              | Description                                                                                                         |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------- |
+| `whl_path`        | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path (positional)~~ |
+| `--org`, `-o`     | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~                            |
+| `--msg`, `-m`     | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~                           |
+| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~                         |
+| **UPLOADS**       | The pipeline to the hub.                                                                                            |
--- a/website/docs/api/spancategorizer.mdx
+++ b/website/docs/api/spancategorizer.mdx
@ -67,7 +67,6 @@ architectures and their arguments and hyperparameters.
 > ```python
 > from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
 > config = {
->     "threshold": 0.5,
 >     "spans_key": "labeled_spans",
 >     "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
 >     "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@ -343,6 +343,130 @@ use with the `manual=True` argument in `displacy.render`.
 | `options`   | Span-specific visualisation options. ~~Dict[str, Any]~~             |
 | **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ |

+### Visualizer data structures {id="displacy_structures"}
+
+You can use displaCy's data format to manually render data. This can be useful
+if you want to visualize output from other libraries. You can find examples of
+displaCy's different data formats below.
+
+> #### DEP example data structure
+>
+> ```json
+> {
+>   "words": [
+>     { "text": "This", "tag": "DT" },
+>     { "text": "is", "tag": "VBZ" },
+>     { "text": "a", "tag": "DT" },
+>     { "text": "sentence", "tag": "NN" }
+>   ],
+>   "arcs": [
+>     { "start": 0, "end": 1, "label": "nsubj", "dir": "left" },
+>     { "start": 2, "end": 3, "label": "det", "dir": "left" },
+>     { "start": 1, "end": 3, "label": "attr", "dir": "right" }
+>   ]
+> }
+> ```
+
+#### Dependency Visualizer data structure {id="structure-dep"}
+
+| Dictionary Key | Description                                                                                                 |
+| -------------- | ----------------------------------------------------------------------------------------------------------- |
+| `words`        | List of dictionaries describing a word token (see structure below). ~~List[Dict[str, Any]]~~                |
+| `arcs`         | List of dictionaries describing the relations between words (see structure below). ~~List[Dict[str, Any]]~~ |
+| _Optional_     |                                                                                                             |
+| `title`        | Title of the visualization. ~~Optional[str]~~                                                               |
+| `settings`     | Dependency Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~             |
+
+<Accordion title="Words data structure">
+
+| Dictionary Key | Description                              |
+| -------------- | ---------------------------------------- |
+| `text`         | Text content of the word. ~~str~~        |
+| `tag`          | Fine-grained part-of-speech. ~~str~~     |
+| `lemma`        | Base form of the word. ~~Optional[str]~~ |
+
+</Accordion>
+
+<Accordion title="Arcs data structure">
+
+| Dictionary Key | Description                                          |
+| -------------- | ---------------------------------------------------- |
+| `start`        | The index of the starting token. ~~int~~             |
+| `end`          | The index of the ending token. ~~int~~               |
+| `label`        | The type of dependency relation. ~~str~~             |
+| `dir`          | Direction of the relation (`left`, `right`). ~~str~~ |
+
+</Accordion>
+
+> #### ENT example data structure
+>
+> ```json
+> {
+>   "text": "But Google is starting from behind.",
+>   "ents": [{ "start": 4, "end": 10, "label": "ORG" }]
+> }
+> ```
+
+#### Named Entity Recognition data structure {id="structure-ent"}
+
+| Dictionary Key | Description                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| `text`         | String representation of the document text. ~~str~~                                         |
+| `ents`         | List of dictionaries describing entities (see structure below). ~~List[Dict[str, Any]]~~    |
+| _Optional_     |                                                                                             |
+| `title`        | Title of the visualization. ~~Optional[str]~~                                               |
+| `settings`     | Entity Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
+
+<Accordion title="Ents data structure">
+
+| Dictionary Key | Description                                                            |
+| -------------- | ---------------------------------------------------------------------- |
+| `start`        | The index of the first character of the entity. ~~int~~                |
+| `end`          | The index of the last character of the entity (not inclusive). ~~int~~ |
+| `label`        | Label attached to the entity. ~~str~~                                  |
+| _Optional_     |                                                                        |
+| `kb_id`        | `KnowledgeBase` ID. ~~str~~                                            |
+| `kb_url`       | `KnowledgeBase` URL. ~~str~~                                           |
+
+</Accordion>
+
+> #### SPAN example data structure
+>
+> ```json
+> {
+>   "text": "Welcome to the Bank of China.",
+>   "spans": [
+>     { "start_token": 3, "end_token": 6, "label": "ORG" },
+>     { "start_token": 5, "end_token": 6, "label": "GPE" }
+>   ],
+>   "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."]
+> }
+> ```
+
+#### Span Classification data structure {id="structure-span"}
+
+| Dictionary Key | Description                                                                               |
+| -------------- | ----------------------------------------------------------------------------------------- |
+| `text`         | String representation of the document text. ~~str~~                                       |
+| `spans`        | List of dictionaries describing spans (see structure below). ~~List[Dict[str, Any]]~~     |
+| `tokens`       | List of word tokens. ~~List[str]~~                                                        |
+| _Optional_     |                                                                                           |
+| `title`        | Title of the visualization. ~~Optional[str]~~                                             |
+| `settings`     | Span Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
+
+<Accordion title="Spans data structure">
+
+| Dictionary Key | Description                                                   |
+| -------------- | ------------------------------------------------------------- |
+| `start_token`  | The index of the first token of the span in `tokens`. ~~int~~ |
+| `end_token`    | Index after the last token of the span in `tokens`. ~~int~~   |
+| `label`        | Label attached to the span. ~~str~~                           |
+| _Optional_     |                                                               |
+| `kb_id`        | `KnowledgeBase` ID. ~~str~~                                   |
+| `kb_url`       | `KnowledgeBase` URL. ~~str~~                                  |
+
+</Accordion>
+
 ### Visualizer options {id="displacy_options"}

 The `options` argument lets you specify additional settings for each visualizer.
--- a/website/docs/usage/visualizers.mdx
+++ b/website/docs/usage/visualizers.mdx
@ -349,7 +349,8 @@ or
 [SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet).
 If you set `manual=True` on either `render()` or `serve()`, you can pass in data
 in displaCy's format as a dictionary (instead of `Doc` objects). There are
-helper functions for converting `Doc` objects to displaCy's format for use with
+helper functions for converting `Doc` objects to
+[displaCy's format](/api/top-level#displacy_structures) for use with
 `manual=True`: [`displacy.parse_deps`](/api/top-level#displacy.parse_deps),
 [`displacy.parse_ents`](/api/top-level#displacy.parse_ents), and
 [`displacy.parse_spans`](/api/top-level#displacy.parse_spans).
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@ -17,6 +17,31 @@
            "category": ["extension"],
            "tags": []
        },
+        {
+            "id": "sayswho",
+            "title": "SaysWho",
+            "slogan": "Quote identification, attribution and resolution",
+            "description": "A Python package for identifying and attributing quotes in text. It uses a combination of spaCy functionality, logic and grammar to find quotes and their speakers, then uses the spaCy coreferencing model to better clarify who is speaking. Currently English only.",
+            "github": "afriedman412/sayswho",
+            "pip": "sayswho",
+            "code_language": "python",
+            "author": "Andy Friedman",
+            "author_links": {
+                "twitter": "@steadynappin",
+                "github": "afriedman412"
+            },
+            "code_example": [
+                "from sayswho import SaysWho",
+                "text = open(\"path/to/your/text_file.txt\").read()",
+                "sw = SaysWho()",
+                "sw.attribute(text)",
+
+                "sw.expand_match() # see quote/cluster matches",
+                "sw.render_to_html() # output your text, quotes and cluster matches to an html file called \"temp.html\""
+            ],
+            "category": ["standalone"],
+            "tags": ["attribution", "coref", "text-processing"]
+        },
        {
            "id": "parsigs",
            "title": "parsigs",
@ -67,6 +92,33 @@
            "category": ["pipeline", "research"],
            "tags": ["latin"]
        },
+        {
+            "id": "odycy",
+            "title": "OdyCy",
+            "slogan": "General-purpose language pipelines for premodern Greek.",
+            "description": "Academically validated modular NLP pipelines for premodern Greek. odyCy achieves state of the art performance on multiple tasks on unseen test data from the Universal Dependencies Perseus treebank, and performs second best on the PROIEL treebank’s test set on even more tasks. In addition performance also seems relatively stable across the two evaluation datasets in comparison with other NLP pipelines. OdyCy is being used at the Center for Humanities Computing for preprocessing and analyzing Ancient Greek corpora for New Testament research, meaning that you can expect consistent maintenance and improvements.",
+            "github": "centre-for-humanities-computing/odyCy",
+            "code_example": [
+                "# To install the high-accuracy transformer-based pipeline",
+                "# pip install https://huggingface.co/chcaa/grc_odycy_joint_trf/resolve/main/grc_odycy_joint_trf-any-py3-none-any.whl",
+                "import spacy",
+                "",
+                "nlp = spacy.load('grc_odycy_joint_trf')",
+                "",
+                "doc = nlp('τὴν γοῦν Ἀττικὴν ἐκ τοῦ ἐπὶ πλεῖστον διὰ τὸ λεπτόγεων ἀστασίαστον οὖσαν ἄνθρωποι ᾤκουν οἱ αὐτοὶ αἰεί.')"
+            ],
+            "code_language": "python",
+            "url": "https://centre-for-humanities-computing.github.io/odyCy/",
+            "thumb": "https://raw.githubusercontent.com/centre-for-humanities-computing/odyCy/7b94fec60679d06272dca88a4dcfe0f329779aea/docs/_static/logo.svg",
+            "image": "https://github.com/centre-for-humanities-computing/odyCy/raw/main/docs/_static/logo_with_text_below.svg",
+            "author": "Jan Kostkan, Márton Kardos (Center for Humanities Computing, Aarhus University)",
+            "author_links": {
+                "github": "centre-for-humanities-computing",
+                "website": "https://chc.au.dk/"
+            },
+            "category": ["pipeline", "standalone", "research"],
+            "tags": ["ancient Greek"]
+        },
        {
            "id": "spacy-wasm",
            "title": "spacy-wasm",