mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-21 17:41:59 +03:00
Merge branch 'master' into spacy-vscode-extension
This commit is contained in:
commit
88ac2c95ea
54
.github/workflows/tests.yml
vendored
54
.github/workflows/tests.yml
vendored
|
@ -107,22 +107,22 @@ jobs:
|
|||
- name: Test import
|
||||
run: python -W error -c "import spacy"
|
||||
|
||||
- name: "Test download CLI"
|
||||
run: |
|
||||
python -m spacy download ca_core_news_sm
|
||||
python -m spacy download ca_core_news_md
|
||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test download_url in info CLI"
|
||||
run: |
|
||||
python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test no warnings on load (#11713)"
|
||||
run: |
|
||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||
if: matrix.python_version == '3.9'
|
||||
# - name: "Test download CLI"
|
||||
# run: |
|
||||
# python -m spacy download ca_core_news_sm
|
||||
# python -m spacy download ca_core_news_md
|
||||
# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
# if: matrix.python_version == '3.9'
|
||||
#
|
||||
# - name: "Test download_url in info CLI"
|
||||
# run: |
|
||||
# python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||
# if: matrix.python_version == '3.9'
|
||||
#
|
||||
# - name: "Test no warnings on load (#11713)"
|
||||
# run: |
|
||||
# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||
# if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test convert CLI"
|
||||
run: |
|
||||
|
@ -146,17 +146,17 @@ jobs:
|
|||
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test assemble CLI"
|
||||
run: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test assemble CLI vectors warning"
|
||||
run: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
if: matrix.python_version == '3.9'
|
||||
# - name: "Test assemble CLI"
|
||||
# run: |
|
||||
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
# if: matrix.python_version == '3.9'
|
||||
#
|
||||
# - name: "Test assemble CLI vectors warning"
|
||||
# run: |
|
||||
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
# if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Install test requirements"
|
||||
run: |
|
||||
|
|
|
@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
|
|||
wasabi>=0.9.1,<1.2.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
typer>=0.3.0,<0.8.0
|
||||
typer>=0.3.0,<0.10.0
|
||||
pathy>=0.10.0
|
||||
smart-open>=5.2.1,<7.0.0
|
||||
# Third party dependencies
|
||||
|
|
|
@ -52,7 +52,7 @@ install_requires =
|
|||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
# Third-party dependencies
|
||||
typer>=0.3.0,<0.8.0
|
||||
typer>=0.3.0,<0.10.0
|
||||
pathy>=0.10.0
|
||||
smart-open>=5.2.1,<7.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "3.5.0"
|
||||
__version__ = "3.6.0.dev0"
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__projects__ = "https://github.com/explosion/projects"
|
||||
|
|
|
@ -81,11 +81,8 @@ def download(
|
|||
|
||||
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
|
||||
dl_tpl = "{m}-{v}/{m}-{v}{s}"
|
||||
egg_tpl = "#egg={m}=={v}"
|
||||
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
|
||||
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
|
||||
if sdist:
|
||||
filename += egg_tpl.format(m=model_name, v=version)
|
||||
return filename
|
||||
|
||||
|
||||
|
|
|
@ -27,6 +27,7 @@ def evaluate_cli(
|
|||
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
||||
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
|
||||
per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -50,6 +51,7 @@ def evaluate_cli(
|
|||
gold_preproc=gold_preproc,
|
||||
displacy_path=displacy_path,
|
||||
displacy_limit=displacy_limit,
|
||||
per_component=per_component,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
@ -64,6 +66,7 @@ def evaluate(
|
|||
displacy_limit: int = 25,
|
||||
silent: bool = True,
|
||||
spans_key: str = "sc",
|
||||
per_component: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
fix_random_seed()
|
||||
|
@ -78,44 +81,53 @@ def evaluate(
|
|||
corpus = Corpus(data_path, gold_preproc=gold_preproc)
|
||||
nlp = util.load_model(model)
|
||||
dev_dataset = list(corpus(nlp))
|
||||
scores = nlp.evaluate(dev_dataset)
|
||||
metrics = {
|
||||
"TOK": "token_acc",
|
||||
"TAG": "tag_acc",
|
||||
"POS": "pos_acc",
|
||||
"MORPH": "morph_acc",
|
||||
"LEMMA": "lemma_acc",
|
||||
"UAS": "dep_uas",
|
||||
"LAS": "dep_las",
|
||||
"NER P": "ents_p",
|
||||
"NER R": "ents_r",
|
||||
"NER F": "ents_f",
|
||||
"TEXTCAT": "cats_score",
|
||||
"SENT P": "sents_p",
|
||||
"SENT R": "sents_r",
|
||||
"SENT F": "sents_f",
|
||||
"SPAN P": f"spans_{spans_key}_p",
|
||||
"SPAN R": f"spans_{spans_key}_r",
|
||||
"SPAN F": f"spans_{spans_key}_f",
|
||||
"SPEED": "speed",
|
||||
}
|
||||
results = {}
|
||||
data = {}
|
||||
for metric, key in metrics.items():
|
||||
if key in scores:
|
||||
if key == "cats_score":
|
||||
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
|
||||
if isinstance(scores[key], (int, float)):
|
||||
if key == "speed":
|
||||
results[metric] = f"{scores[key]:.0f}"
|
||||
scores = nlp.evaluate(dev_dataset, per_component=per_component)
|
||||
if per_component:
|
||||
data = scores
|
||||
if output is None:
|
||||
msg.warn(
|
||||
"The per-component option is enabled but there is no output JSON file provided to save the scores to."
|
||||
)
|
||||
else:
|
||||
msg.info("Per-component scores will be saved to output JSON file.")
|
||||
else:
|
||||
metrics = {
|
||||
"TOK": "token_acc",
|
||||
"TAG": "tag_acc",
|
||||
"POS": "pos_acc",
|
||||
"MORPH": "morph_acc",
|
||||
"LEMMA": "lemma_acc",
|
||||
"UAS": "dep_uas",
|
||||
"LAS": "dep_las",
|
||||
"NER P": "ents_p",
|
||||
"NER R": "ents_r",
|
||||
"NER F": "ents_f",
|
||||
"TEXTCAT": "cats_score",
|
||||
"SENT P": "sents_p",
|
||||
"SENT R": "sents_r",
|
||||
"SENT F": "sents_f",
|
||||
"SPAN P": f"spans_{spans_key}_p",
|
||||
"SPAN R": f"spans_{spans_key}_r",
|
||||
"SPAN F": f"spans_{spans_key}_f",
|
||||
"SPEED": "speed",
|
||||
}
|
||||
results = {}
|
||||
data = {}
|
||||
for metric, key in metrics.items():
|
||||
if key in scores:
|
||||
if key == "cats_score":
|
||||
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
|
||||
if isinstance(scores[key], (int, float)):
|
||||
if key == "speed":
|
||||
results[metric] = f"{scores[key]:.0f}"
|
||||
else:
|
||||
results[metric] = f"{scores[key]*100:.2f}"
|
||||
else:
|
||||
results[metric] = f"{scores[key]*100:.2f}"
|
||||
else:
|
||||
results[metric] = "-"
|
||||
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
|
||||
results[metric] = "-"
|
||||
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
|
||||
|
||||
msg.table(results, title="Results")
|
||||
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
|
||||
msg.table(results, title="Results")
|
||||
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
|
||||
|
||||
if displacy_path:
|
||||
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
|
||||
|
|
|
@ -1372,6 +1372,7 @@ class Language:
|
|||
scorer: Optional[Scorer] = None,
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
scorer_cfg: Optional[Dict[str, Any]] = None,
|
||||
per_component: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
"""Evaluate a model's pipeline components.
|
||||
|
||||
|
@ -1383,6 +1384,8 @@ class Language:
|
|||
arguments for specific components.
|
||||
scorer_cfg (dict): An optional dictionary with extra keyword arguments
|
||||
for the scorer.
|
||||
per_component (bool): Whether to return the scores keyed by component
|
||||
name. Defaults to False.
|
||||
|
||||
RETURNS (Scorer): The scorer containing the evaluation results.
|
||||
|
||||
|
@ -1415,7 +1418,7 @@ class Language:
|
|||
for eg, doc in zip(examples, docs):
|
||||
eg.predicted = doc
|
||||
end_time = timer()
|
||||
results = scorer.score(examples)
|
||||
results = scorer.score(examples, per_component=per_component)
|
||||
n_words = sum(len(eg.predicted) for eg in examples)
|
||||
results["speed"] = n_words / (end_time - start_time)
|
||||
return results
|
||||
|
|
|
@ -121,20 +121,30 @@ class Scorer:
|
|||
nlp.add_pipe(pipe)
|
||||
self.nlp = nlp
|
||||
|
||||
def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
|
||||
def score(
|
||||
self, examples: Iterable[Example], *, per_component: bool = False
|
||||
) -> Dict[str, Any]:
|
||||
"""Evaluate a list of Examples.
|
||||
|
||||
examples (Iterable[Example]): The predicted annotations + correct annotations.
|
||||
per_component (bool): Whether to return the scores keyed by component
|
||||
name. Defaults to False.
|
||||
RETURNS (Dict): A dictionary of scores.
|
||||
|
||||
DOCS: https://spacy.io/api/scorer#score
|
||||
"""
|
||||
scores = {}
|
||||
if hasattr(self.nlp.tokenizer, "score"):
|
||||
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
|
||||
if per_component:
|
||||
scores["tokenizer"] = self.nlp.tokenizer.score(examples, **self.cfg)
|
||||
else:
|
||||
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
|
||||
for name, component in self.nlp.pipeline:
|
||||
if hasattr(component, "score"):
|
||||
scores.update(component.score(examples, **self.cfg))
|
||||
if per_component:
|
||||
scores[name] = component.score(examples, **self.cfg)
|
||||
else:
|
||||
scores.update(component.score(examples, **self.cfg))
|
||||
return scores
|
||||
|
||||
@staticmethod
|
||||
|
|
|
@ -115,6 +115,14 @@ def test_tokenization(sented_doc):
|
|||
assert scores["token_r"] == approx(0.33333333)
|
||||
assert scores["token_f"] == 0.4
|
||||
|
||||
# per-component scoring
|
||||
scorer = Scorer()
|
||||
scores = scorer.score([example], per_component=True)
|
||||
assert scores["tokenizer"]["token_acc"] == 0.5
|
||||
assert scores["tokenizer"]["token_p"] == 0.5
|
||||
assert scores["tokenizer"]["token_r"] == approx(0.33333333)
|
||||
assert scores["tokenizer"]["token_f"] == 0.4
|
||||
|
||||
|
||||
def test_sents(sented_doc):
|
||||
scorer = Scorer()
|
||||
|
@ -278,6 +286,13 @@ def test_tag_score(tagged_doc):
|
|||
assert results["morph_per_feat"]["Poss"]["f"] == 0.0
|
||||
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
|
||||
|
||||
# per-component scoring
|
||||
scorer = Scorer()
|
||||
results = scorer.score([example], per_component=True)
|
||||
assert results["tagger"]["tag_acc"] == 0.9
|
||||
assert results["morphologizer"]["pos_acc"] == 0.9
|
||||
assert results["morphologizer"]["morph_acc"] == approx(0.8)
|
||||
|
||||
|
||||
def test_partial_annotation(en_tokenizer):
|
||||
pred_doc = en_tokenizer("a b c d e")
|
||||
|
|
|
@ -133,10 +133,11 @@ def init_vocab(
|
|||
logger.info("Added vectors: %s", vectors)
|
||||
# warn if source model vectors are not identical
|
||||
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
|
||||
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
|
||||
for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
|
||||
if vectors_hash != sourced_vectors_hash:
|
||||
warnings.warn(Warnings.W113.format(name=sourced_component))
|
||||
if len(sourced_vectors_hashes) > 0:
|
||||
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
|
||||
for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
|
||||
if vectors_hash != sourced_vectors_hash:
|
||||
warnings.warn(Warnings.W113.format(name=sourced_component))
|
||||
logger.info("Finished initializing nlp object")
|
||||
|
||||
|
||||
|
|
|
@ -1163,18 +1163,19 @@ skew. To render a sample of dependency parses in a HTML file using the
|
|||
$ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
|
||||
| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
|
||||
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
|
||||
| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Training results and optional metrics and visualizations. |
|
||||
| Name | Description |
|
||||
| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
|
||||
| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
|
||||
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
|
||||
| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
|
||||
| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Training results and optional metrics and visualizations. |
|
||||
|
||||
### speed {id="benchmark-speed", version="3.5", tag="command"}
|
||||
|
||||
|
@ -1640,7 +1641,7 @@ with [`spacy package`](/api/cli#package) and `--build wheel`. For more details,
|
|||
see the spaCy project [integration](/usage/projects#huggingface_hub).
|
||||
|
||||
```bash
|
||||
$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] [--verbose]
|
||||
$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose]
|
||||
```
|
||||
|
||||
> #### Example
|
||||
|
@ -1654,6 +1655,5 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo]
|
|||
| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ |
|
||||
| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ |
|
||||
| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ |
|
||||
| `--local-repo`, `-l` | Local path to the model repository (will be created if it doesn't exist). Defaults to `hub` in the current working directory. ~~Path (option)~~ |
|
||||
| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ |
|
||||
| **UPLOADS** | The pipeline to the hub. |
|
||||
|
|
|
@ -64,7 +64,7 @@ architectures and their arguments and hyperparameters.
|
|||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
|
||||
| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
||||
| `generate_empty_kb` <Tag variant="new">3.5.1</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||
|
|
|
@ -382,15 +382,16 @@ objects instead of tuples of `Doc` and `GoldParse` objects.
|
|||
> print(scores)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The batch size to use. ~~Optional[int]~~ |
|
||||
| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
|
||||
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
|
||||
| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |
|
||||
| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
| Name | Description |
|
||||
| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The batch size to use. ~~Optional[int]~~ |
|
||||
| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
|
||||
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
|
||||
| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |
|
||||
| `per_component` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ |
|
||||
| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## Language.use_params {id="use_params",tag="contextmanager, method"}
|
||||
|
||||
|
|
|
@ -213,11 +213,11 @@ Retrieve values for a feature by field.
|
|||
> assert morph.get("Feat1") == ["Val1", "Val2"]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `field` | The field to retrieve. ~~str~~ |
|
||||
| `default` <Tag variant="new">3.6</Tag> | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ |
|
||||
| **RETURNS** | A list of the individual features. ~~List[str]~~ |
|
||||
| Name | Description |
|
||||
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `field` | The field to retrieve. ~~str~~ |
|
||||
| `default` <Tag variant="new">3.5.3</Tag> | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ |
|
||||
| **RETURNS** | A list of the individual features. ~~List[str]~~ |
|
||||
|
||||
### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"}
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ Create a new `Scorer`.
|
|||
| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ |
|
||||
| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |
|
||||
| `**kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |
|
||||
|
||||
## Scorer.score {id="score",tag="method"}
|
||||
|
||||
|
@ -67,10 +67,12 @@ core pipeline components, the individual score names start with the `Token` or
|
|||
> scores = scorer.score(examples)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
| Name | Description |
|
||||
| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `per_component` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ |
|
||||
| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## Scorer.score_tokenization {id="score_tokenization",tag="staticmethod",version="3"}
|
||||
|
||||
|
|
|
@ -469,7 +469,7 @@ factories.
|
|||
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
||||
| `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). |
|
||||
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
|
||||
| `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `\*\*kwargs` and return scores as `Dict[str, Any]`. |
|
||||
| `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `**kwargs` and return scores as `Dict[str, Any]`. |
|
||||
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
|
||||
|
||||
### spacy-transformers registry {id="registry-transformers"}
|
||||
|
|
|
@ -56,14 +56,19 @@ wrap. So if you come across this problem, especially when using custom labels,
|
|||
you'll have to increase the `distance` setting in the `options` to allow longer
|
||||
arcs.
|
||||
|
||||
Moreover, you might need to modify the `offset_x` argument depending on the shape
|
||||
of your document. Otherwise, the left part of the document may overflow beyond the
|
||||
container's border.
|
||||
|
||||
</Infobox>
|
||||
|
||||
| Argument | Description |
|
||||
| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
||||
| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
|
||||
| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
|
||||
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
|
||||
| Argument | Description |
|
||||
| ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
||||
| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
|
||||
| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
|
||||
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
|
||||
| `offset_x` | Spacing on left side of the SVG in px. You might need to tweak this setting for long texts. Defaults to `50`. ~~int~~ |
|
||||
|
||||
For a list of all available options, see the
|
||||
[`displacy` API documentation](/api/top-level#displacy_options).
|
||||
|
|
|
@ -17,6 +17,56 @@
|
|||
"category": ["extension"],
|
||||
"tags": []
|
||||
},
|
||||
{
|
||||
"id": "parsigs",
|
||||
"title": "parsigs",
|
||||
"slogan": "Structuring prescriptions text made simple using spaCy",
|
||||
"description": "Parsigs is an open-source project that aims to extract the relevant dosage information from prescriptions text without compromising the patient's privacy.\n\nNotice you also need to install the model in order to use the package: `pip install https://huggingface.co/royashcenazi/en_parsigs/resolve/main/en_parsigs-any-py3-none-any.whl`",
|
||||
"github": "royashcenazi/parsigs",
|
||||
"pip": "parsigs",
|
||||
"code_language": "python",
|
||||
"author": "Roy Ashcenazi",
|
||||
"code_example": [
|
||||
"# You'll need to install the trained model, see instructions in the description section",
|
||||
"from parsigs.parse_sig_api import StructuredSig, SigParser",
|
||||
"sig_parser = SigParser()",
|
||||
"",
|
||||
"sig = 'Take 1 tablet of ibuprofen 200mg 3 times every day for 3 weeks'",
|
||||
"parsed_sig = sig_parser.parse(sig)"
|
||||
],
|
||||
"author_links": {
|
||||
"github": "royashcenazi"
|
||||
},
|
||||
"category": ["model", "research", "biomedical"],
|
||||
"tags": ["sigs", "prescription","pharma"]
|
||||
},
|
||||
{
|
||||
"id": "latincy",
|
||||
"title": "LatinCy",
|
||||
"thumb": "https://raw.githubusercontent.com/diyclassics/la_core_web_lg/main/latincy-logo.png",
|
||||
"slogan": "Synthetic trained spaCy pipelines for Latin NLP",
|
||||
"description": "Set of trained general purpose Latin-language 'core' pipelines for use with spaCy. The models are trained on a large amount of available Latin data, including all five of the Latin Universal Dependency treebanks, which have been preprocessed to be compatible with each other.",
|
||||
"url": "https://huggingface.co/latincy",
|
||||
"code_example": [
|
||||
"# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl",
|
||||
"import spacy",
|
||||
"nlp = spacy.load('la_core_web_lg')",
|
||||
"doc = nlp('Haec narrantur a poetis de Perseo')",
|
||||
"",
|
||||
"print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')",
|
||||
"",
|
||||
"# > Haec, haec, hic, DET"
|
||||
],
|
||||
"code_language": "python",
|
||||
"author": "Patrick J. Burns",
|
||||
"author_links": {
|
||||
"twitter": "@diyclassics",
|
||||
"github": "diyclassics",
|
||||
"website": "https://diyclassics.github.io/"
|
||||
},
|
||||
"category": ["pipeline", "research"],
|
||||
"tags": ["latin"]
|
||||
},
|
||||
{
|
||||
"id": "spacy-wasm",
|
||||
"title": "spacy-wasm",
|
||||
|
@ -2827,6 +2877,58 @@
|
|||
"tags": ["coreference", "multi-lingual", "cross-lingual", "allennlp"],
|
||||
"spacy_version": 3
|
||||
},
|
||||
{
|
||||
"id": "adeptaugmentations",
|
||||
"title": "Adept Augmentations",
|
||||
"slogan": " A Python library aimed at dissecting and augmenting NER training data for a few-shot scenario.",
|
||||
"description": "EntitySwapAugmenter takes either a `datasets.Dataset` or a `spacy.tokens.DocBin`. Additionally, it is optional to provide a set of labels. It initially creates a knowledge base of entities belonging to a certain label. When running `augmenter.augment()` for N runs, it then creates N new sentences with random swaps of the original entities with an entity of the same corresponding label from the knowledge base.\n\nFor example, assuming that we have knowledge base for `PERSONS`, `LOCATIONS` and `PRODUCTS`. We can then create additional data for the sentence \"Momofuko Ando created instant noodles in Osaka.\" using `augmenter.augment(N=2)`, resulting in \"David created instant noodles in Madrid.\" or \"Tom created Adept Augmentations in the Netherlands\".",
|
||||
"github": "argilla-io/adept-augmentations",
|
||||
"pip": "adept-augmentations",
|
||||
"thumb": "https://raw.githubusercontent.com/argilla-io/adept-augmentations/main/logo.png",
|
||||
"code_example": [
|
||||
"from adept_augmentations import EntitySwapAugmenter",
|
||||
"import spacy",
|
||||
"from spacy.tokens import Doc, DocBin",
|
||||
"nlp = spacy.blank(\"en\")",
|
||||
"",
|
||||
"# Create some example golden data",
|
||||
"example_data = [",
|
||||
" (\"Apple is looking at buying U.K. startup for $1 billion\", [(0, 5, \"ORG\"), (27, 31, \"LOC\"), (44, 54, \"MONEY\")]),",
|
||||
" (\"Microsoft acquires GitHub for $7.5 billion\", [(0, 9, \"ORG\"), (19, 25, \"ORG\"), (30, 42, \"MONEY\")]),",
|
||||
"]",
|
||||
"",
|
||||
"# Create a new DocBin",
|
||||
"nlp = spacy.blank(\"en\")",
|
||||
"docs = []",
|
||||
"for entry in example_data:",
|
||||
" doc = Doc(nlp.vocab, words=entry[0].split())",
|
||||
" doc.ents = [doc.char_span(ent[0], ent[1], label=ent[2]) for ent in entry[1]]",
|
||||
" docs.append(doc)",
|
||||
"golden_dataset = DocBin(docs=docs)",
|
||||
"",
|
||||
"# Augment Data",
|
||||
"augmented_dataset = EntitySwapAugmenter(golden_dataset).augment(4)",
|
||||
"for doc in augmented_dataset.get_docs(nlp.vocab):",
|
||||
" print(doc.text)",
|
||||
"",
|
||||
"# GitHub is looking at buying U.K. startup for $ 7.5 billion",
|
||||
"# Microsoft is looking at buying U.K. startup for $ 1 billion",
|
||||
"# Microsoft is looking at buying U.K. startup for $ 7.5 billion",
|
||||
"# GitHub is looking at buying U.K. startup for $ 1 billion",
|
||||
"# Microsoft acquires Apple for $ 7.5 billion",
|
||||
"# Apple acquires Microsoft for $ 1 billion",
|
||||
"# Microsoft acquires Microsoft for $ 7.5 billion",
|
||||
"# GitHub acquires GitHub for $ 1 billion"
|
||||
],
|
||||
"author": "David Berenstein",
|
||||
"author_links": {
|
||||
"github": "davidberenstein1957",
|
||||
"website": "https://www.linkedin.com/in/david-berenstein-1bab11105/"
|
||||
},
|
||||
"category": ["standalone"],
|
||||
"tags": ["ner", "few-shot", "augmentation", "datasets", "training"],
|
||||
"spacy_version": 3
|
||||
},
|
||||
{
|
||||
"id": "blackstone",
|
||||
"title": "Blackstone",
|
||||
|
|
Loading…
Reference in New Issue
Block a user