Merge branch 'upstream_master' into test-cli-app-init-config

This commit is contained in:
svlandeg 2023-07-31 11:09:09 +02:00
commit d67807d91f
10 changed files with 385 additions and 9 deletions

View File

@ -14,6 +14,7 @@ from .debug_diff import debug_diff # noqa: F401
from .debug_model import debug_model # noqa: F401
from .download import download # noqa: F401
from .evaluate import evaluate # noqa: F401
from .find_function import find_function # noqa: F401
from .find_threshold import find_threshold # noqa: F401
from .info import info # noqa: F401
from .init_config import fill_config, init_config # noqa: F401

View File

@ -0,0 +1,69 @@
from typing import Optional, Tuple
from catalogue import RegistryError
from wasabi import msg
from ..util import registry
from ._util import Arg, Opt, app
@app.command("find-function")
def find_function_cli(
    # fmt: off
    func_name: str = Arg(..., help="Name of the registered function."),
    registry_name: Optional[str] = Opt(None, "--registry", "-r", help="Name of the catalogue registry."),
    # fmt: on
):
    """
    Find the module, path and line number to the file the registered
    function is defined in, if available.
    func_name (str): Name of the registered function.
    registry_name (Optional[str]): Name of the catalogue registry.
    DOCS: https://spacy.io/api/cli#find-function
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably surfaced as the command's help text by `app` -- confirm
    # before rewording it.
    if not registry_name:
        # No registry was given: pick the first registry that contains
        # func_name. When none does, the current (falsy) value is kept so the
        # failure branch below is taken.
        matching_registries = (
            candidate
            for candidate in registry.get_registry_names()
            if registry.has(candidate, func_name)
        )
        registry_name = next(matching_registries, registry_name)

    if not registry_name:
        msg.fail(f"Couldn't find registered function: '{func_name}'", exits=1)

    # The assert documents (and lets a type checker see) that a registry name
    # is available from here on.
    assert registry_name is not None
    find_function(func_name, registry_name)
def find_function(func_name: str, registry_name: str) -> Tuple[str, int]:
    """Report where a registered function is defined.

    Looks the function up via registry.find and prints the result with
    wasabi's msg helper. Lookup errors and missing location info are
    reported through msg.fail(..., exits=1).

    func_name (str): Name of the registered function.
    registry_name (str): Name of the registry to look the function up in.
    RETURNS (Tuple[str, int]): The file path and the line number taken from
        the "file" and "line_no" entries of the lookup result.
    """
    lookup = None
    try:
        lookup = registry.find(registry_name, func_name)
    except RegistryError as e:
        # Give context first, then print the registry's own error message and
        # request exit code 1.
        msg.fail(
            f"Couldn't find registered function: '{func_name}' in registry '{registry_name}'",
        )
        msg.fail(f"{e}", exits=1)
    assert lookup is not None

    # The location is only read when the lookup result has a truthy "file".
    source_file, source_line = None, None
    if lookup["file"]:
        source_file, source_line = lookup["file"], lookup["line_no"]

    if not (source_file and source_line):
        msg.fail(
            f"Couldn't find path to registered function: '{func_name}' in registry '{registry_name}'",
            exits=1,
        )
    assert source_file is not None
    assert source_line is not None

    msg.good(f"Found registered function '{func_name}' at {source_file}:{source_line}")
    return str(source_file), int(source_line)

View File

@ -313,6 +313,8 @@ class DependencyRenderer:
self.lang = settings.get("lang", DEFAULT_LANG)
render_id = f"{id_prefix}-{i}"
svg = self.render_svg(render_id, p["words"], p["arcs"])
if p.get("title"):
svg = TPL_TITLE.format(title=p.get("title")) + svg
rendered.append(svg)
if page:
content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered])

View File

@ -235,6 +235,40 @@ def test_project_push_pull(project_dir):
assert test_file.is_file()
def test_find_function_valid():
    """The find-function command locates functions that are registered."""
    # Each case: (function name, extra CLI arguments, expected stdout parts)
    cases = [
        # example of architecture in main code base, registry given
        ("spacy.TextCatBOW.v2", ["-r", "architectures"], ["textcat.py"]),
        # same architecture, registry looked up automatically
        ("spacy.TextCatBOW.v2", [], ["textcat.py"]),
        # example of architecture in spacy-legacy
        ("spacy.TextCatBOW.v1", [], ["spacy_legacy", "textcat.py"]),
    ]
    for function, extra_args, expected_parts in cases:
        cli_args = ["find-function", function, *extra_args]
        result = CliRunner().invoke(app, cli_args)
        assert f"Found registered function '{function}'" in result.stdout
        for part in expected_parts:
            assert part in result.stdout
def test_find_function_invalid():
    """The find-function command reports unknown registries and functions."""
    # invalid registry
    known_function = "spacy.TextCatBOW.v2"
    unknown_registry = "foobar"
    cli_args = ["find-function", known_function, "--registry", unknown_registry]
    outcome = CliRunner().invoke(app, cli_args)
    assert f"Unknown function registry: '{unknown_registry}'" in outcome.stdout

    # invalid function
    unknown_function = "spacy.TextCatBOW.v666"
    outcome = CliRunner().invoke(app, ["find-function", unknown_function])
    expected_message = f"Couldn't find registered function: '{unknown_function}'"
    assert expected_message in outcome.stdout
example_words_1 = ["I", "like", "cats"]
example_words_2 = ["I", "like", "dogs"]
example_lemmas_1 = ["I", "like", "cat"]

View File

@ -350,6 +350,78 @@ def test_displacy_render_wrapper(en_vocab):
displacy.set_render_wrapper(lambda html: html)
def test_displacy_render_manual_dep():
    """Test displacy.render with manual data for dep style"""
    parsed_dep = {
        "words": [
            {"text": "This", "tag": "DT"},
            {"text": "is", "tag": "VBZ"},
            {"text": "a", "tag": "DT"},
            {"text": "sentence", "tag": "NN"},
        ],
        "arcs": [
            {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "det", "dir": "left"},
            {"start": 1, "end": 3, "label": "attr", "dir": "right"},
        ],
        "title": "Title",
    }
    html = displacy.render([parsed_dep], style="dep", manual=True)
    # Every word and its tag from the manual input must appear in the markup.
    for word in parsed_dep["words"]:
        assert word["text"] in html
        assert word["tag"] in html
    # The title is part of the manual input as well, so it has to be rendered
    # too; without this check the "title" entry above would go untested.
    assert parsed_dep["title"] in html
def test_displacy_render_manual_ent():
    """Test displacy.render with manual data for ent style"""
    # First input has no title; the second has one and uses entity offsets
    # that reach beyond the text.
    untitled = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    titled = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": -100, "end": 100, "label": "COMPANY"}],
        "title": "Title",
    }
    inputs = [untitled, titled]
    html = displacy.render(inputs, style="ent", manual=True)
    for item in inputs:
        first_label = item["ents"][0]["label"]
        assert first_label in html
        # A title only has to be rendered when the input provides one.
        assert "title" not in item or item["title"] in html
def test_displacy_render_manual_span():
    """Test displacy.render with manual data for span style"""

    def make_input(**extra):
        # Build fresh objects on every call so the two inputs share nothing.
        return {
            "text": "Welcome to the Bank of China.",
            "spans": [
                {"start_token": 3, "end_token": 6, "label": "ORG"},
                {"start_token": 5, "end_token": 6, "label": "GPE"},
            ],
            "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
            **extra,
        }

    # Same document twice: once without and once with a title.
    inputs = [make_input(), make_input(title="Title")]
    html = displacy.render(inputs, style="span", manual=True)
    for item in inputs:
        first_label = item["spans"][0]["label"]
        assert first_label in html
        # A title only has to be rendered when the input provides one.
        assert "title" not in item or item["title"] in html
def test_displacy_options_case():
ents = ["foo", "BAR"]
colors = {"FOO": "red", "bar": "green"}

View File

@ -7,6 +7,7 @@ menu:
- ['info', 'info']
- ['validate', 'validate']
- ['init', 'init']
- ['find-function', 'find-function']
- ['convert', 'convert']
- ['debug', 'debug']
- ['train', 'train']
@ -251,6 +252,27 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The label files. |
## find-function {id="find-function",version="3.7",tag="command"}
Find the module, path and line number of the file in which a given registered
function is defined. This is helpful for understanding where registered
functions, as used in the config file, are defined.
```bash
$ python -m spacy find-function [func_name] [--registry]
```
> #### Example
>
> ```bash
> $ python -m spacy find-function spacy.TextCatBOW.v1
> ```
| Name | Description |
| ------------------ | ----------------------------------------------------- |
| `func_name` | Name of the registered function. ~~str (positional)~~ |
| `--registry`, `-r` | Name of the catalogue registry. ~~str (option)~~ |
## convert {id="convert",tag="command"}
Convert files into spaCy's
@ -1651,10 +1673,10 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose]
> $ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl
> ```
| Name | Description |
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ |
| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ |
| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ |
| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ |
| **UPLOADS** | The pipeline to the hub. |
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------- |
| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path (positional)~~ |
| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ |
| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ |
| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ |
| **UPLOADS** | The pipeline to the hub. |

View File

@ -67,7 +67,6 @@ architectures and their arguments and hyperparameters.
> ```python
> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
> config = {
> "threshold": 0.5,
> "spans_key": "labeled_spans",
> "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
> "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},

View File

@ -343,6 +343,130 @@ use with the `manual=True` argument in `displacy.render`.
| `options` | Span-specific visualisation options. ~~Dict[str, Any]~~ |
| **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ |
### Visualizer data structures {id="displacy_structures"}
You can use displaCy's data format to manually render data. This can be useful
if you want to visualize output from other libraries. You can find examples of
displaCy's different data formats below.
> #### DEP example data structure
>
> ```json
> {
> "words": [
> { "text": "This", "tag": "DT" },
> { "text": "is", "tag": "VBZ" },
> { "text": "a", "tag": "DT" },
> { "text": "sentence", "tag": "NN" }
> ],
> "arcs": [
> { "start": 0, "end": 1, "label": "nsubj", "dir": "left" },
> { "start": 2, "end": 3, "label": "det", "dir": "left" },
> { "start": 1, "end": 3, "label": "attr", "dir": "right" }
> ]
> }
> ```
#### Dependency Visualizer data structure {id="structure-dep"}
| Dictionary Key | Description |
| -------------- | ----------------------------------------------------------------------------------------------------------- |
| `words` | List of dictionaries describing a word token (see structure below). ~~List[Dict[str, Any]]~~ |
| `arcs` | List of dictionaries describing the relations between words (see structure below). ~~List[Dict[str, Any]]~~ |
| _Optional_ | |
| `title` | Title of the visualization. ~~Optional[str]~~ |
| `settings` | Dependency Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
<Accordion title="Words data structure">
| Dictionary Key | Description |
| -------------- | ---------------------------------------- |
| `text` | Text content of the word. ~~str~~ |
| `tag` | Fine-grained part-of-speech. ~~str~~ |
| `lemma` | Base form of the word. ~~Optional[str]~~ |
</Accordion>
<Accordion title="Arcs data structure">
| Dictionary Key | Description |
| -------------- | ---------------------------------------------------- |
| `start` | The index of the starting token. ~~int~~ |
| `end` | The index of the ending token. ~~int~~ |
| `label` | The type of dependency relation. ~~str~~ |
| `dir` | Direction of the relation (`left`, `right`). ~~str~~ |
</Accordion>
> #### ENT example data structure
>
> ```json
> {
> "text": "But Google is starting from behind.",
> "ents": [{ "start": 4, "end": 10, "label": "ORG" }]
> }
> ```
#### Named Entity Recognition data structure {id="structure-ent"}
| Dictionary Key | Description |
| -------------- | ------------------------------------------------------------------------------------------- |
| `text` | String representation of the document text. ~~str~~ |
| `ents` | List of dictionaries describing entities (see structure below). ~~List[Dict[str, Any]]~~ |
| _Optional_ | |
| `title` | Title of the visualization. ~~Optional[str]~~ |
| `settings` | Entity Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
<Accordion title="Ents data structure">
| Dictionary Key | Description |
| -------------- | ---------------------------------------------------------------------- |
| `start` | The index of the first character of the entity. ~~int~~ |
| `end` | The index one past the last character of the entity, i.e. the end offset is not inclusive. ~~int~~ |
| `label` | Label attached to the entity. ~~str~~ |
| _Optional_ | |
| `kb_id` | `KnowledgeBase` ID. ~~str~~ |
| `kb_url` | `KnowledgeBase` URL. ~~str~~ |
</Accordion>
> #### SPAN example data structure
>
> ```json
> {
> "text": "Welcome to the Bank of China.",
> "spans": [
> { "start_token": 3, "end_token": 6, "label": "ORG" },
> { "start_token": 5, "end_token": 6, "label": "GPE" }
> ],
> "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."]
> }
> ```
#### Span Classification data structure {id="structure-span"}
| Dictionary Key | Description |
| -------------- | ----------------------------------------------------------------------------------------- |
| `text` | String representation of the document text. ~~str~~ |
| `spans` | List of dictionaries describing spans (see structure below). ~~List[Dict[str, Any]]~~ |
| `tokens` | List of word tokens. ~~List[str]~~ |
| _Optional_ | |
| `title` | Title of the visualization. ~~Optional[str]~~ |
| `settings` | Span Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
<Accordion title="Spans data structure">
| Dictionary Key | Description |
| -------------- | ------------------------------------------------------------- |
| `start_token` | The index of the first token of the span in `tokens`. ~~int~~ |
| `end_token` | The index one past the last token of the span in `tokens`, i.e. the end index is not inclusive. ~~int~~ |
| `label` | Label attached to the span. ~~str~~ |
| _Optional_ | |
| `kb_id` | `KnowledgeBase` ID. ~~str~~ |
| `kb_url` | `KnowledgeBase` URL. ~~str~~ |
</Accordion>
### Visualizer options {id="displacy_options"}
The `options` argument lets you specify additional settings for each visualizer.

View File

@ -349,7 +349,8 @@ or
[SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet).
If you set `manual=True` on either `render()` or `serve()`, you can pass in data
in displaCy's format as a dictionary (instead of `Doc` objects). There are
helper functions for converting `Doc` objects to displaCy's format for use with
helper functions for converting `Doc` objects to
[displaCy's format](/api/top-level#displacy_structures) for use with
`manual=True`: [`displacy.parse_deps`](/api/top-level#displacy.parse_deps),
[`displacy.parse_ents`](/api/top-level#displacy.parse_ents), and
[`displacy.parse_spans`](/api/top-level#displacy.parse_spans).

View File

@ -17,6 +17,31 @@
"category": ["extension"],
"tags": []
},
{
"id": "sayswho",
"title": "SaysWho",
"slogan": "Quote identification, attribution and resolution",
"description": "A Python package for identifying and attributing quotes in text. It uses a combination of spaCy functionality, logic and grammar to find quotes and their speakers, then uses the spaCy coreferencing model to better clarify who is speaking. Currently English only.",
"github": "afriedman412/sayswho",
"pip": "sayswho",
"code_language": "python",
"author": "Andy Friedman",
"author_links": {
"twitter": "@steadynappin",
"github": "afriedman412"
},
"code_example": [
"from sayswho import SaysWho",
"text = open(\"path/to/your/text_file.txt\").read()",
"sw = SaysWho()",
"sw.attribute(text)",
"sw.expand_match() # see quote/cluster matches",
"sw.render_to_html() # output your text, quotes and cluster matches to an html file called \"temp.html\""
],
"category": ["standalone"],
"tags": ["attribution", "coref", "text-processing"]
},
{
"id": "parsigs",
"title": "parsigs",
@ -67,6 +92,33 @@
"category": ["pipeline", "research"],
"tags": ["latin"]
},
{
"id": "odycy",
"title": "OdyCy",
"slogan": "General-purpose language pipelines for premodern Greek.",
"description": "Academically validated modular NLP pipelines for premodern Greek. odyCy achieves state of the art performance on multiple tasks on unseen test data from the Universal Dependencies Perseus treebank, and performs second best on the PROIEL treebanks test set on even more tasks. In addition, performance also seems relatively stable across the two evaluation datasets in comparison with other NLP pipelines. OdyCy is being used at the Center for Humanities Computing for preprocessing and analyzing Ancient Greek corpora for New Testament research, meaning that you can expect consistent maintenance and improvements.",
"github": "centre-for-humanities-computing/odyCy",
"code_example": [
"# To install the high-accuracy transformer-based pipeline",
"# pip install https://huggingface.co/chcaa/grc_odycy_joint_trf/resolve/main/grc_odycy_joint_trf-any-py3-none-any.whl",
"import spacy",
"",
"nlp = spacy.load('grc_odycy_joint_trf')",
"",
"doc = nlp('τὴν γοῦν Ἀττικὴν ἐκ τοῦ ἐπὶ πλεῖστον διὰ τὸ λεπτόγεων ἀστασίαστον οὖσαν ἄνθρωποι ᾤκουν οἱ αὐτοὶ αἰεί.')"
],
"code_language": "python",
"url": "https://centre-for-humanities-computing.github.io/odyCy/",
"thumb": "https://raw.githubusercontent.com/centre-for-humanities-computing/odyCy/7b94fec60679d06272dca88a4dcfe0f329779aea/docs/_static/logo.svg",
"image": "https://github.com/centre-for-humanities-computing/odyCy/raw/main/docs/_static/logo_with_text_below.svg",
"author": "Jan Kostkan, Márton Kardos (Center for Humanities Computing, Aarhus University)",
"author_links": {
"github": "centre-for-humanities-computing",
"website": "https://chc.au.dk/"
},
"category": ["pipeline", "standalone", "research"],
"tags": ["ancient Greek"]
},
{
"id": "spacy-wasm",
"title": "spacy-wasm",