Merge remote-tracking branch 'upstream/master' into add-span-finder

Adriane Boyd 2023-06-02 20:01:37 +02:00
commit 9c403f1f30
48 changed files with 4453 additions and 164 deletions


@ -107,22 +107,22 @@ jobs:
- name: Test import - name: Test import
run: python -W error -c "import spacy" run: python -W error -c "import spacy"
- name: "Test download CLI" # - name: "Test download CLI"
run: | # run: |
python -m spacy download ca_core_news_sm # python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md # python -m spacy download ca_core_news_md
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" # python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
if: matrix.python_version == '3.9' # if: matrix.python_version == '3.9'
#
- name: "Test download_url in info CLI" # - name: "Test download_url in info CLI"
run: | # run: |
python -W error -m spacy info ca_core_news_sm | grep -q download_url # python -W error -m spacy info ca_core_news_sm | grep -q download_url
if: matrix.python_version == '3.9' # if: matrix.python_version == '3.9'
#
- name: "Test no warnings on load (#11713)" # - name: "Test no warnings on load (#11713)"
run: | # run: |
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" # python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
if: matrix.python_version == '3.9' # if: matrix.python_version == '3.9'
- name: "Test convert CLI" - name: "Test convert CLI"
run: | run: |
@ -146,17 +146,17 @@ jobs:
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
if: matrix.python_version == '3.9' if: matrix.python_version == '3.9'
- name: "Test assemble CLI" # - name: "Test assemble CLI"
run: | # run: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" # python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir # PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
if: matrix.python_version == '3.9' # if: matrix.python_version == '3.9'
#
- name: "Test assemble CLI vectors warning" # - name: "Test assemble CLI vectors warning"
run: | # run: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" # python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 # python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
if: matrix.python_version == '3.9' # if: matrix.python_version == '3.9'
- name: "Install test requirements" - name: "Install test requirements"
run: | run: |
@ -165,6 +165,7 @@ jobs:
- name: "Run CPU tests" - name: "Run CPU tests"
run: | run: |
python -m pytest --pyargs spacy -W error python -m pytest --pyargs spacy -W error
if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
- name: "Run CPU tests with thinc-apple-ops" - name: "Run CPU tests with thinc-apple-ops"
run: | run: |


@ -35,19 +35,20 @@ open-source software, released under the [MIT license](https://github.com/explos
## 📖 Documentation ## 📖 Documentation
| Documentation | | | Documentation | |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------------- | ---------------------------------------------------------------------- |
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! | | ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
| 📚 **[Usage Guides]** | How to use spaCy and its features. | | 📚 **[Usage Guides]** | How to use spaCy and its features. |
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. | | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. | | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
| 🎛 **[API Reference]** | The detailed reference for spaCy's API. | | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
| 📦 **[Models]** | Download trained pipelines for spaCy. | | 📦 **[Models]** | Download trained pipelines for spaCy. |
| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. | | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. | | ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. | | 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
| 🛠 **[Changelog]** | Changes and version history. | | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. | | 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** | | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** |
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** | | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** |
@ -57,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
[api reference]: https://spacy.io/api/ [api reference]: https://spacy.io/api/
[models]: https://spacy.io/models [models]: https://spacy.io/models
[universe]: https://spacy.io/universe [universe]: https://spacy.io/universe
[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
[videos]: https://www.youtube.com/c/ExplosionAI [videos]: https://www.youtube.com/c/ExplosionAI
[online course]: https://course.spacy.io [online course]: https://course.spacy.io
[project templates]: https://github.com/explosion/projects [project templates]: https://github.com/explosion/projects
[changelog]: https://spacy.io/usage#changelog [changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
## 💬 Where to ask questions ## 💬 Where to ask questions
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about). The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).


@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0 wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0 typer>=0.3.0,<0.10.0
pathy>=0.10.0 pathy>=0.10.0
smart-open>=5.2.1,<7.0.0 smart-open>=5.2.1,<7.0.0
# Third party dependencies # Third party dependencies


@ -52,7 +52,7 @@ install_requires =
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
# Third-party dependencies # Third-party dependencies
typer>=0.3.0,<0.8.0 typer>=0.3.0,<0.10.0
pathy>=0.10.0 pathy>=0.10.0
smart-open>=5.2.1,<7.0.0 smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0


@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "3.5.0" __version__ = "3.6.0.dev0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects" __projects__ = "https://github.com/explosion/projects"


@ -81,11 +81,8 @@ def download(
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str: def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
dl_tpl = "{m}-{v}/{m}-{v}{s}" dl_tpl = "{m}-{v}/{m}-{v}{s}"
egg_tpl = "#egg={m}=={v}"
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
filename = dl_tpl.format(m=model_name, v=version, s=suffix) filename = dl_tpl.format(m=model_name, v=version, s=suffix)
if sdist:
filename += egg_tpl.format(m=model_name, v=version)
return filename return filename
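With this change the sdist filename no longer carries an "#egg=" fragment. A rough sketch of the resulting path, assuming SDIST_SUFFIX is ".tar.gz" (the suffix constants are not shown in this diff):

dl_tpl = "{m}-{v}/{m}-{v}{s}"
print(dl_tpl.format(m="ca_core_news_sm", v="3.6.0", s=".tar.gz"))
# -> ca_core_news_sm-3.6.0/ca_core_news_sm-3.6.0.tar.gz  (no trailing "#egg=..." anymore)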


@ -27,6 +27,7 @@ def evaluate_cli(
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
# fmt: on # fmt: on
): ):
""" """
@ -50,6 +51,7 @@ def evaluate_cli(
gold_preproc=gold_preproc, gold_preproc=gold_preproc,
displacy_path=displacy_path, displacy_path=displacy_path,
displacy_limit=displacy_limit, displacy_limit=displacy_limit,
per_component=per_component,
silent=False, silent=False,
) )
@ -64,6 +66,7 @@ def evaluate(
displacy_limit: int = 25, displacy_limit: int = 25,
silent: bool = True, silent: bool = True,
spans_key: str = "sc", spans_key: str = "sc",
per_component: bool = False,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
msg = Printer(no_print=silent, pretty=not silent) msg = Printer(no_print=silent, pretty=not silent)
fix_random_seed() fix_random_seed()
@ -78,50 +81,61 @@ def evaluate(
corpus = Corpus(data_path, gold_preproc=gold_preproc) corpus = Corpus(data_path, gold_preproc=gold_preproc)
nlp = util.load_model(model) nlp = util.load_model(model)
dev_dataset = list(corpus(nlp)) dev_dataset = list(corpus(nlp))
scores = nlp.evaluate(dev_dataset) scores = nlp.evaluate(dev_dataset, per_component=per_component)
metrics = { if per_component:
"TOK": "token_acc", data = scores
"TAG": "tag_acc", if output is None:
"POS": "pos_acc", msg.warn(
"MORPH": "morph_acc", "The per-component option is enabled but there is no output JSON file provided to save the scores to."
"LEMMA": "lemma_acc", )
"UAS": "dep_uas", else:
"LAS": "dep_las", msg.info("Per-component scores will be saved to output JSON file.")
"NER P": "ents_p", else:
"NER R": "ents_r", metrics = {
"NER F": "ents_f", "TOK": "token_acc",
"TEXTCAT": "cats_score", "TAG": "tag_acc",
"SENT P": "sents_p", "POS": "pos_acc",
"SENT R": "sents_r", "MORPH": "morph_acc",
"SENT F": "sents_f", "LEMMA": "lemma_acc",
"SPAN P": f"spans_{spans_key}_p", "UAS": "dep_uas",
"SPAN R": f"spans_{spans_key}_r", "LAS": "dep_las",
"SPAN F": f"spans_{spans_key}_f", "NER P": "ents_p",
"SPEED": "speed", "NER R": "ents_r",
} "NER F": "ents_f",
results = {} "TEXTCAT": "cats_score",
data = {} "SENT P": "sents_p",
for metric, key in metrics.items(): "SENT R": "sents_r",
if key in scores: "SENT F": "sents_f",
if key == "cats_score": "SPAN P": f"spans_{spans_key}_p",
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" "SPAN R": f"spans_{spans_key}_r",
if isinstance(scores[key], (int, float)): "SPAN F": f"spans_{spans_key}_f",
if key == "speed": "SPEED": "speed",
results[metric] = f"{scores[key]:.0f}" }
results = {}
data = {}
for metric, key in metrics.items():
if key in scores:
if key == "cats_score":
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
if isinstance(scores[key], (int, float)):
if key == "speed":
results[metric] = f"{scores[key]:.0f}"
else:
results[metric] = f"{scores[key]*100:.2f}"
else: else:
results[metric] = f"{scores[key]*100:.2f}" results[metric] = "-"
else: data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
results[metric] = "-"
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
msg.table(results, title="Results") msg.table(results, title="Results")
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent) data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
if displacy_path: if displacy_path:
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit])) docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
render_deps = "parser" in factory_names render_deps = "parser" in factory_names
render_ents = "ner" in factory_names render_ents = "ner" in factory_names
render_spans = "spancat" in factory_names
render_parses( render_parses(
docs, docs,
displacy_path, displacy_path,
@ -129,6 +143,7 @@ def evaluate(
limit=displacy_limit, limit=displacy_limit,
deps=render_deps, deps=render_deps,
ents=render_ents, ents=render_ents,
spans=render_spans,
) )
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
@ -182,6 +197,7 @@ def render_parses(
limit: int = 250, limit: int = 250,
deps: bool = True, deps: bool = True,
ents: bool = True, ents: bool = True,
spans: bool = True,
): ):
docs[0].user_data["title"] = model_name docs[0].user_data["title"] = model_name
if ents: if ents:
@ -195,6 +211,11 @@ def render_parses(
with (output_path / "parses.html").open("w", encoding="utf8") as file_: with (output_path / "parses.html").open("w", encoding="utf8") as file_:
file_.write(html) file_.write(html)
if spans:
html = displacy.render(docs[:limit], style="span", page=True)
with (output_path / "spans.html").open("w", encoding="utf8") as file_:
file_.write(html)
def print_prf_per_type( def print_prf_per_type(
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
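The CLI diff above adds a --per-component / -P flag and renders span annotations (spans.html) next to the dependency and entity visualizations. A hedged sketch using the Python entry point rather than the command line; the pipeline name and paths are hypothetical:

from pathlib import Path
from spacy.cli.evaluate import evaluate

scores = evaluate(
    "ca_core_news_sm",             # hypothetical pipeline
    Path("dev.spacy"),             # hypothetical dev set
    output=Path("scores.json"),    # per-component scores are only written to disk when an output file is given
    per_component=True,            # new option; --per-component / -P on the command line
    displacy_path=Path("parses"),  # an existing directory; spans.html is written when a spancat component is present
)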


@ -970,9 +970,12 @@ class Errors(metaclass=ErrorsWithCodes):
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_select_port=True` to pick an available port automatically.") "or use `auto_select_port=True` to pick an available port automatically.")
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.") E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
E1052 = ("Both 'min_length' and 'max_length' should be larger than 0, but found" E1052 = ("Unable to copy spans: the character offsets for the span at "
"index {i} in the span group do not align with the tokenization "
"in the target doc.")
E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
" 'min_length': {min_length}, 'max_length': {max_length}") " 'min_length': {min_length}, 'max_length': {max_length}")
E1053 = ("The text, including whitespace, must match between reference and " E1054 = ("The text, including whitespace, must match between reference and "
"predicted docs when training {component}.") "predicted docs when training {component}.")

spacy/lang/ms/__init__.py (new file, 24 lines)

@ -0,0 +1,24 @@
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults
class MalayDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Malay(Language):
lang = "ms"
Defaults = MalayDefaults
__all__ = ["Malay"]
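Not part of the diff, just a minimal sanity check that the new language registers; the sentence is taken from the examples file added below:

import spacy

nlp = spacy.blank("ms")
doc = nlp("Malaysia ialah sebuah negara yang terletak di Asia Tenggara.")
print([token.text for token in doc])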

File diff suppressed because it is too large.

spacy/lang/ms/examples.py (new file, 17 lines)

@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.ms.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Malaysia ialah sebuah negara yang terletak di Asia Tenggara.",
"Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?",
"Pengeluaran makanan berasal dari beberapa lokasi termasuk Cameron Highlands, Johor Bahru, dan Kuching.",
"Syarikat XYZ telah menghasilkan 20,000 unit produk baharu dalam setahun terakhir",
"Kuala Lumpur merupakan ibu negara Malaysia." "Kau berada di mana semalam?",
"Siapa yang akan memimpin projek itu?",
"Siapa perdana menteri Malaysia sekarang?",
]


@ -0,0 +1,66 @@
import unicodedata
from .punctuation import LIST_CURRENCY
from ...attrs import IS_CURRENCY, LIKE_NUM
_num_words = [
"kosong",
"satu",
"dua",
"tiga",
"empat",
"lima",
"enam",
"tujuh",
"lapan",
"sembilan",
"sepuluh",
"sebelas",
"belas",
"puluh",
"ratus",
"ribu",
"juta",
"billion",
"trillion",
"kuadrilion",
"kuintilion",
"sekstilion",
"septilion",
"oktilion",
"nonilion",
"desilion",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
if text.lower() in _num_words:
return True
if text.count("-") == 1:
_, num = text.split("-")
if num.isdigit() or num in _num_words:
return True
return False
def is_currency(text):
if text in LIST_CURRENCY:
return True
for char in text:
if unicodedata.category(char) != "Sc":
return False
return True
LEX_ATTRS = {IS_CURRENCY: is_currency, LIKE_NUM: like_num}
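A few illustrative checks against the helpers above (a sketch, not taken from the test suite):

from spacy.lang.ms.lex_attrs import is_currency, like_num

assert like_num("sembilan")    # number word
assert like_num("11.000")      # "," and "." are stripped before isdigit()
assert like_num("2/3")         # simple fractions
assert not like_num("Kuching")
assert is_currency("RM")       # listed in LIST_CURRENCY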


@ -0,0 +1,61 @@
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ..char_classes import ALPHA, merge_chars, split_chars, _currency, _units
_units = (
_units + "s bit Gbps Mbps mbps Kbps kbps ƒ ppi px "
"Hz kHz MHz GHz mAh "
"ratus rb ribu ribuan "
"juta jt jutaan mill?iar million bil[l]?iun bilyun billion "
)
_currency = _currency + r" USD RM MYR Rp IDR RMB SGD S\$"
_months = (
"Januari Februari Mac April Mei Jun Julai Ogos September "
"Oktober November Disember Januari Februari Mac Mei Jun "
"Julai Ogos Oktober Disember Jan Feb Mac Jun Julai Ogos Sept "
"Okt Nov Dis"
)
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
HTML_PREFIX = r"<(b|strong|i|em|p|span|div|br)\s?/>|<a([^>]+)>"
HTML_SUFFIX = r"</(b|strong|i|em|p|span|div|a)>"
MONTHS = merge_chars(_months)
LIST_CURRENCY = split_chars(_currency)
_prefixes = list(TOKENIZER_PREFIXES)
_prefixes.remove("#") # hashtag
_prefixes = _prefixes + LIST_CURRENCY + [HTML_PREFIX] + ["/", ""]
_suffixes = (
TOKENIZER_SUFFIXES
+ [r"\-[Nn]ya", "-[KkMm]u", "[—-]"]
+ [
# disabled: variable width currency variable
# r"(?<={c})(?:[0-9]+)".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[0-9])%",
# disabled: variable width HTML_SUFFIX variable
# r"(?<=[0-9{a}]{h})(?:[\.,:-])".format(a=ALPHA, h=HTML_SUFFIX),
r"(?<=[0-9{a}])(?:{h})".format(a=ALPHA, h=HTML_SUFFIX),
]
)
_infixes = TOKENIZER_INFIXES + [
r"(?<=[0-9])[\\/](?=[0-9%-])",
r"(?<=[0-9])%(?=[{a}0-9/])".format(a=ALPHA),
# disabled: variable width units variable
# r"(?<={u})[\/-](?=[0-9])".format(u=UNITS),
# disabled: variable width months variable
# r"(?<={m})[\/-](?=[0-9])".format(m=MONTHS),
r'(?<=[0-9)][.,])"(?=[0-9])',
r'(?<=[{a})][.,\'])["—](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}])-(?=[0-9])".format(a=ALPHA),
r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[\/-](?={c}|[{a}])".format(a=ALPHA, c=CURRENCY),
]
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes
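A rough illustration of one suffix rule above, the "(?<=[0-9])%" pattern; hedged, since the final token boundaries also depend on the shared base rules, and the sentence is made up:

import spacy

nlp = spacy.blank("ms")
print([t.text for t in nlp("Kenaikan harga ialah 10% setahun.")])
# "10%" is expected to split into "10" and "%"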

spacy/lang/ms/stop_words.py (new file, 118 lines)

@ -0,0 +1,118 @@
STOP_WORDS = set(
"""
ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila
apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal
awalnya
bagai bagaikan bagaimana bagaimanakah bagaimanapun bagi bagian bahkan bahwa
bahwasanya baik bakal bakalan balik banyak bapak baru bawah beberapa begini
beginian beginikah beginilah begitu begitukah begitulah begitupun bekerja
belakang belakangan belum belumlah benar benarkah benarlah berada berakhir
berakhirlah berakhirnya berapa berapakah berapalah berapapun berarti berawal
berbagai berdatangan beri berikan berikut berikutnya berjumlah berkali-kali
berkata berkehendak berkeinginan berkenaan berlainan berlalu berlangsung
berlebihan bermacam bermacam-macam bermaksud bermula bersama bersama-sama
bersiap bersiap-siap bertanya bertanya-tanya berturut berturut-turut bertutur
berujar berupa besar betul betulkah biasa biasanya bila bilakah bisa bisakah
boleh bolehkah bolehlah buat bukan bukankah bukanlah bukannya bulan bung
cara caranya cukup cukupkah cukuplah cuma
dahulu dalam dan dapat dari daripada datang dekat demi demikian demikianlah
dengan depan di dia diakhiri diakhirinya dialah diantara diantaranya diberi
diberikan diberikannya dibuat dibuatnya didapat didatangkan digunakan
diibaratkan diibaratkannya diingat diingatkan diinginkan dijawab dijelaskan
dijelaskannya dikarenakan dikatakan dikatakannya dikerjakan diketahui
diketahuinya dikira dilakukan dilalui dilihat dimaksud dimaksudkan
dimaksudkannya dimaksudnya diminta dimintai dimisalkan dimulai dimulailah
dimulainya dimungkinkan dini dipastikan diperbuat diperbuatnya dipergunakan
diperkirakan diperlihatkan diperlukan diperlukannya dipersoalkan dipertanyakan
dipunyai diri dirinya disampaikan disebut disebutkan disebutkannya disini
disinilah ditambahkan ditandaskan ditanya ditanyai ditanyakan ditegaskan
ditujukan ditunjuk ditunjuki ditunjukkan ditunjukkannya ditunjuknya dituturkan
dituturkannya diucapkan diucapkannya diungkapkan dong dua dulu
empat enggak enggaknya entah entahlah
guna gunakan
hal hampir hanya hanyalah hari harus haruslah harusnya hendak hendaklah
hendaknya hingga
ia ialah ibarat ibaratkan ibaratnya ibu ikut ingat ingat-ingat ingin inginkah
inginkan ini inikah inilah itu itukah itulah
jadi jadilah jadinya jangan jangankan janganlah jauh jawab jawaban jawabnya
jelas jelaskan jelaslah jelasnya jika jikalau juga jumlah jumlahnya justru
kala kalau kalaulah kalaupun kalian kami kamilah kamu kamulah kan kapan
kapankah kapanpun karena karenanya kasus kata katakan katakanlah katanya ke
keadaan kebetulan kecil kedua keduanya keinginan kelamaan kelihatan
kelihatannya kelima keluar kembali kemudian kemungkinan kemungkinannya kenapa
kepada kepadanya kesampaian keseluruhan keseluruhannya keterlaluan ketika
khususnya kini kinilah kira kira-kira kiranya kita kitalah kok kurang
lagi lagian lah lain lainnya lalu lama lamanya lanjut lanjutnya lebih lewat
lima luar
macam maka makanya makin malah malahan mampu mampukah mana manakala manalagi
masa masalah masalahnya masih masihkah masing masing-masing mau maupun
melainkan melakukan melalui melihat melihatnya memang memastikan memberi
memberikan membuat memerlukan memihak meminta memintakan memisalkan memperbuat
mempergunakan memperkirakan memperlihatkan mempersiapkan mempersoalkan
mempertanyakan mempunyai memulai memungkinkan menaiki menambahkan menandaskan
menanti menanti-nanti menantikan menanya menanyai menanyakan mendapat
mendapatkan mendatang mendatangi mendatangkan menegaskan mengakhiri mengapa
mengatakan mengatakannya mengenai mengerjakan mengetahui menggunakan
menghendaki mengibaratkan mengibaratkannya mengingat mengingatkan menginginkan
mengira mengucapkan mengucapkannya mengungkapkan menjadi menjawab menjelaskan
menuju menunjuk menunjuki menunjukkan menunjuknya menurut menuturkan
menyampaikan menyangkut menyatakan menyebutkan menyeluruh menyiapkan merasa
mereka merekalah merupakan meski meskipun meyakini meyakinkan minta mirip
misal misalkan misalnya mula mulai mulailah mulanya mungkin mungkinkah
nah naik namun nanti nantinya nyaris nyatanya
oleh olehnya
pada padahal padanya pak paling panjang pantas para pasti pastilah penting
pentingnya per percuma perlu perlukah perlunya pernah persoalan pertama
pertama-tama pertanyaan pertanyakan pihak pihaknya pukul pula pun punya
rasa rasanya rata rupanya
saat saatnya saja sajalah saling sama sama-sama sambil sampai sampai-sampai
sampaikan sana sangat sangatlah satu saya sayalah se sebab sebabnya sebagai
sebagaimana sebagainya sebagian sebaik sebaik-baiknya sebaiknya sebaliknya
sebanyak sebegini sebegitu sebelum sebelumnya sebenarnya seberapa sebesar
sebetulnya sebisanya sebuah sebut sebutlah sebutnya secara secukupnya sedang
sedangkan sedemikian sedikit sedikitnya seenaknya segala segalanya segera
seharusnya sehingga seingat sejak sejauh sejenak sejumlah sekadar sekadarnya
sekali sekali-kali sekalian sekaligus sekalipun sekarang sekarang sekecil
seketika sekiranya sekitar sekitarnya sekurang-kurangnya sekurangnya sela
selain selaku selalu selama selama-lamanya selamanya selanjutnya seluruh
seluruhnya semacam semakin semampu semampunya semasa semasih semata semata-mata
semaunya sementara semisal semisalnya sempat semua semuanya semula sendiri
sendirian sendirinya seolah seolah-olah seorang sepanjang sepantasnya
sepantasnyalah seperlunya seperti sepertinya sepihak sering seringnya serta
serupa sesaat sesama sesampai sesegera sesekali seseorang sesuatu sesuatunya
sesudah sesudahnya setelah setempat setengah seterusnya setiap setiba setibanya
setidak-tidaknya setidaknya setinggi seusai sewaktu siap siapa siapakah
siapapun sini sinilah soal soalnya suatu sudah sudahkah sudahlah supaya
tadi tadinya tahu tahun tak tambah tambahnya tampak tampaknya tandas tandasnya
tanpa tanya tanyakan tanyanya tapi tegas tegasnya telah tempat tengah tentang
tentu tentulah tentunya tepat terakhir terasa terbanyak terdahulu terdapat
terdiri terhadap terhadapnya teringat teringat-ingat terjadi terjadilah
terjadinya terkira terlalu terlebih terlihat termasuk ternyata tersampaikan
tersebut tersebutlah tertentu tertuju terus terutama tetap tetapi tiap tiba
tiba-tiba tidak tidakkah tidaklah tiga tinggi toh tunjuk turut tutur tuturnya
ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai
waduh wah wahai waktu waktunya walau walaupun wong
yaitu yakin yakni yang
""".split()
)


@ -0,0 +1,41 @@
from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
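A small sketch mirroring the noun_chunks test added later in this diff: without dependency annotation ("DEP"), iterating noun_chunks raises the E029 ValueError.

import spacy

nlp = spacy.blank("ms")
doc = nlp("sebelas")
try:
    list(doc.noun_chunks)
except ValueError:
    pass  # a blank pipeline has no parser, so no "DEP" annotation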

File diff suppressed because it is too large.


@ -1,6 +1,6 @@
from typing import Iterator, Optional, Any, Dict, Callable, Iterable from typing import Iterator, Optional, Any, Dict, Callable, Iterable
from typing import Union, Tuple, List, Set, Pattern, Sequence from typing import Union, Tuple, List, Set, Pattern, Sequence
from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload from typing import NoReturn, TypeVar, cast, overload
from dataclasses import dataclass from dataclasses import dataclass
import random import random
@ -1269,7 +1269,10 @@ class Language:
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples" "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
) )
doc = Doc(self.vocab, words=["x", "y", "z"]) doc = Doc(self.vocab, words=["x", "y", "z"])
get_examples = lambda: [Example.from_dict(doc, {})]
def get_examples():
return [Example.from_dict(doc, {})]
if not hasattr(get_examples, "__call__"): if not hasattr(get_examples, "__call__"):
err = Errors.E930.format( err = Errors.E930.format(
method="Language.initialize", obj=type(get_examples) method="Language.initialize", obj=type(get_examples)
@ -1372,6 +1375,7 @@ class Language:
scorer: Optional[Scorer] = None, scorer: Optional[Scorer] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
scorer_cfg: Optional[Dict[str, Any]] = None, scorer_cfg: Optional[Dict[str, Any]] = None,
per_component: bool = False,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Evaluate a model's pipeline components. """Evaluate a model's pipeline components.
@ -1383,6 +1387,8 @@ class Language:
arguments for specific components. arguments for specific components.
scorer_cfg (dict): An optional dictionary with extra keyword arguments scorer_cfg (dict): An optional dictionary with extra keyword arguments
for the scorer. for the scorer.
per_component (bool): Whether to return the scores keyed by component
name. Defaults to False.
RETURNS (Scorer): The scorer containing the evaluation results. RETURNS (Scorer): The scorer containing the evaluation results.
@ -1415,7 +1421,7 @@ class Language:
for eg, doc in zip(examples, docs): for eg, doc in zip(examples, docs):
eg.predicted = doc eg.predicted = doc
end_time = timer() end_time = timer()
results = scorer.score(examples) results = scorer.score(examples, per_component=per_component)
n_words = sum(len(eg.predicted) for eg in examples) n_words = sum(len(eg.predicted) for eg in examples)
results["speed"] = n_words / (end_time - start_time) results["speed"] = n_words / (end_time - start_time)
return results return results
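A hedged sketch of the new per_component flag on Language.evaluate; nlp and dev_examples (a list of Example objects) are assumed to exist:

flat = nlp.evaluate(dev_examples)                        # {"token_acc": ..., "tag_acc": ..., "speed": ...}
nested = nlp.evaluate(dev_examples, per_component=True)  # {"tokenizer": {...}, "tagger": {...}, ..., "speed": ...}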


@ -1,4 +1,4 @@
from typing import Tuple, Callable from typing import List, Tuple, Callable
from thinc.api import Model, to_numpy from thinc.api import Model, to_numpy
from thinc.types import Ragged, Ints1d from thinc.types import Ragged, Ints1d
@ -52,14 +52,14 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
indices will be [5, 6, 7, 8, 8, 9]. indices will be [5, 6, 7, 8, 8, 9].
""" """
spans, lengths = _ensure_cpu(spans, lengths) spans, lengths = _ensure_cpu(spans, lengths)
indices = [] indices: List[int] = []
offset = 0 offset = 0
for i, length in enumerate(lengths): for i, length in enumerate(lengths):
spans_i = spans[i].dataXd + offset spans_i = spans[i].dataXd + offset
for j in range(spans_i.shape[0]): for j in range(spans_i.shape[0]):
indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) # type: ignore[call-overload, index] indices.extend(range(spans_i[j, 0], spans_i[j, 1])) # type: ignore[arg-type, call-overload]
offset += length offset += length
return ops.flatten(indices, dtype="i", ndim_if_empty=1) return ops.asarray1i(indices)
def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]: def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
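A tiny illustration of the refactor above: the per-span xp.arange plus ops.flatten is replaced by extending a plain Python list and converting it once with ops.asarray1i. The offsets below are made up to match the docstring's example:

from thinc.api import NumpyOps

ops = NumpyOps()
indices = []
for start, end in [(5, 9), (8, 10)]:  # two hypothetical span offsets
    indices.extend(range(start, end))
print(ops.asarray1i(indices))         # [5 6 7 8 8 9]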


@ -173,7 +173,7 @@ class SpanFinder(TrainablePipe):
min_length = 1 min_length = 1
if max_length < 1 or min_length < 1: if max_length < 1 or min_length < 1:
raise ValueError( raise ValueError(
Errors.E1052.format(min_length=min_length, max_length=max_length) Errors.E1053.format(min_length=min_length, max_length=max_length)
) )
self.min_length = min_length self.min_length = min_length
self.max_length = max_length self.max_length = max_length
@ -267,7 +267,7 @@ class SpanFinder(TrainablePipe):
masks = [] masks = []
for eg in examples: for eg in examples:
if eg.x.text != eg.y.text: if eg.x.text != eg.y.text:
raise ValueError(Errors.E1053.format(component="span_finder")) raise ValueError(Errors.E1054.format(component="span_finder"))
n_tokens = len(eg.predicted) n_tokens = len(eg.predicted)
truth = ops.xp.zeros((n_tokens, 2), dtype="float32") truth = ops.xp.zeros((n_tokens, 2), dtype="float32")
mask = ops.xp.ones((n_tokens, 2), dtype="float32") mask = ops.xp.ones((n_tokens, 2), dtype="float32")


@ -121,20 +121,30 @@ class Scorer:
nlp.add_pipe(pipe) nlp.add_pipe(pipe)
self.nlp = nlp self.nlp = nlp
def score(self, examples: Iterable[Example]) -> Dict[str, Any]: def score(
self, examples: Iterable[Example], *, per_component: bool = False
) -> Dict[str, Any]:
"""Evaluate a list of Examples. """Evaluate a list of Examples.
examples (Iterable[Example]): The predicted annotations + correct annotations. examples (Iterable[Example]): The predicted annotations + correct annotations.
per_component (bool): Whether to return the scores keyed by component
name. Defaults to False.
RETURNS (Dict): A dictionary of scores. RETURNS (Dict): A dictionary of scores.
DOCS: https://spacy.io/api/scorer#score DOCS: https://spacy.io/api/scorer#score
""" """
scores = {} scores = {}
if hasattr(self.nlp.tokenizer, "score"): if hasattr(self.nlp.tokenizer, "score"):
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore if per_component:
scores["tokenizer"] = self.nlp.tokenizer.score(examples, **self.cfg)
else:
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
for name, component in self.nlp.pipeline: for name, component in self.nlp.pipeline:
if hasattr(component, "score"): if hasattr(component, "score"):
scores.update(component.score(examples, **self.cfg)) if per_component:
scores[name] = component.score(examples, **self.cfg)
else:
scores.update(component.score(examples, **self.cfg))
return scores return scores
@staticmethod @staticmethod


@ -291,6 +291,11 @@ def ml_tokenizer():
return get_lang_class("ml")().tokenizer return get_lang_class("ml")().tokenizer
@pytest.fixture(scope="session")
def ms_tokenizer():
return get_lang_class("ms")().tokenizer
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def nb_tokenizer(): def nb_tokenizer():
return get_lang_class("nb")().tokenizer return get_lang_class("nb")().tokenizer


@ -93,6 +93,21 @@ def test_span_group_copy(doc):
assert span_group.attrs["key"] == "value" assert span_group.attrs["key"] == "value"
assert list(span_group) != list(clone) assert list(span_group) != list(clone)
# can't copy if the character offsets don't align to tokens
doc2 = Doc(doc.vocab, words=[t.text + "x" for t in doc])
with pytest.raises(ValueError):
span_group.copy(doc=doc2)
# can copy with valid character offsets despite different tokenization
doc3 = doc.copy()
with doc3.retokenize() as retokenizer:
retokenizer.merge(doc3[0:2])
retokenizer.merge(doc3[3:6])
span_group = SpanGroup(doc, spans=[doc[0:6], doc[3:6]])
for span1, span2 in zip(span_group, span_group.copy(doc=doc3)):
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
def test_span_group_set_item(doc, other_doc): def test_span_group_set_item(doc, other_doc):
span_group = doc.spans["SPANS"] span_group = doc.spans["SPANS"]
@ -253,3 +268,12 @@ def test_span_group_typing(doc: Doc):
for i, span in enumerate(span_group): for i, span in enumerate(span_group):
assert span == span_group[i] == spans[i] assert span == span_group[i] == spans[i]
filter_spans(span_group) filter_spans(span_group)
def test_span_group_init_doc(en_tokenizer):
"""Test that all spans must come from the specified doc."""
doc1 = en_tokenizer("a b c")
doc2 = en_tokenizer("a b c")
span_group = SpanGroup(doc1, spans=[doc1[0:1], doc1[1:2]])
with pytest.raises(ValueError):
span_group = SpanGroup(doc1, spans=[doc1[0:1], doc2[1:2]])



@ -0,0 +1,8 @@
import pytest
def test_noun_chunks_is_parsed_ms(ms_tokenizer):
"""Test that noun_chunks raises Value Error for 'ms' language if Doc is not parsed."""
doc = ms_tokenizer("sebelas")
with pytest.raises(ValueError):
list(doc.noun_chunks)


@ -0,0 +1,112 @@
import pytest
@pytest.mark.parametrize("text", ["(Ma'arif)"])
def test_ms_tokenizer_splits_no_special(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["Ma'arif"])
def test_ms_tokenizer_splits_no_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize("text", ["(Ma'arif"])
def test_ms_tokenizer_splits_prefix_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize("text", ["Ma'arif)"])
def test_ms_tokenizer_splits_suffix_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize("text", ["(Ma'arif)"])
def test_ms_tokenizer_splits_even_wrap(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["(Ma'arif?)"])
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize("text,length", [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
def test_ms_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
tokens = id_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize("text", ["S.Kom.)"])
def test_ms_tokenizer_splits_suffix_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize("text", ["(S.Kom.)"])
def test_ms_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["(S.Kom.?)"])
def test_ms_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize(
"text,length",
[("kerana", 1), ("Mahathir-Anwar", 3), ("Tun Dr. Ismail-Abdul Rahman", 6)],
)
def test_my_tokenizer_splits_hyphens(ms_tokenizer, text, length):
tokens = ms_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_ms_tokenizer_splits_numeric_range(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["ini.Sani", "Halo.Malaysia"])
def test_ms_tokenizer_splits_period_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["Halo,Malaysia", "satu,dua"])
def test_ms_tokenizer_splits_comma_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
assert tokens[0].text == text.split(",")[0]
assert tokens[1].text == ","
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize("text", ["halo...Malaysia", "dia...pergi"])
def test_ms_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
def test_ms_tokenizer_splits_double_hyphen_infix(id_tokenizer):
tokens = id_tokenizer("Arsene Wenger--pengurus Arsenal--mengadakan sidang media.")
assert len(tokens) == 10
assert tokens[0].text == "Arsene"
assert tokens[1].text == "Wenger"
assert tokens[2].text == "--"
assert tokens[3].text == "pengurus"
assert tokens[4].text == "Arsenal"
assert tokens[5].text == "--"
assert tokens[6].text == "mengadakan"
assert tokens[7].text == "sidang"
assert tokens[8].text == "media"
assert tokens[9].text == "."


@ -0,0 +1,8 @@
import pytest
from spacy.lang.ms.lex_attrs import like_num
@pytest.mark.parametrize("word", ["sebelas"])
def test_ms_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())


@ -728,9 +728,9 @@ def test_neg_annotation(neg_key):
ner.add_label("ORG") ner.add_label("ORG")
example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]}) example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
example.reference.spans[neg_key] = [ example.reference.spans[neg_key] = [
Span(neg_doc, 2, 4, "ORG"), Span(example.reference, 2, 4, "ORG"),
Span(neg_doc, 2, 3, "PERSON"), Span(example.reference, 2, 3, "PERSON"),
Span(neg_doc, 1, 4, "PERSON"), Span(example.reference, 1, 4, "PERSON"),
] ]
optimizer = nlp.initialize() optimizer = nlp.initialize()
@ -755,7 +755,7 @@ def test_neg_annotation_conflict(neg_key):
ner.add_label("PERSON") ner.add_label("PERSON")
ner.add_label("LOC") ner.add_label("LOC")
example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]}) example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "PERSON")] example.reference.spans[neg_key] = [Span(example.reference, 2, 4, "PERSON")]
assert len(example.reference.ents) == 1 assert len(example.reference.ents) == 1
assert example.reference.ents[0].text == "Shaka Khan" assert example.reference.ents[0].text == "Shaka Khan"
assert example.reference.ents[0].label_ == "PERSON" assert example.reference.ents[0].label_ == "PERSON"
@ -788,7 +788,7 @@ def test_beam_valid_parse(neg_key):
doc = Doc(nlp.vocab, words=tokens) doc = Doc(nlp.vocab, words=tokens)
example = Example.from_dict(doc, {"ner": iob}) example = Example.from_dict(doc, {"ner": iob})
neg_span = Span(doc, 50, 53, "ORG") neg_span = Span(example.reference, 50, 53, "ORG")
example.reference.spans[neg_key] = [neg_span] example.reference.spans[neg_key] = [neg_span]
optimizer = nlp.initialize() optimizer = nlp.initialize()


@ -72,7 +72,7 @@ def entity_linker():
def create_kb(vocab): def create_kb(vocab):
kb = InMemoryLookupKB(vocab, entity_vector_length=1) kb = InMemoryLookupKB(vocab, entity_vector_length=1)
kb.add_entity("test", 0.0, zeros((1, 1), dtype="f")) kb.add_entity("test", 0.0, zeros((1,), dtype="f"))
return kb return kb
entity_linker = nlp.add_pipe("entity_linker") entity_linker = nlp.add_pipe("entity_linker")
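For context on the one-line test fix above: add_entity takes an entity vector of length entity_vector_length, and the updated test passes a 1-D array instead of a (1, 1) matrix. A sketch with an assumed nlp object:

from numpy import zeros
from spacy.kb import InMemoryLookupKB

kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
kb.add_entity("test", 0.0, zeros((1,), dtype="f"))  # shape (1,), not (1, 1)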


@ -12,6 +12,7 @@ import srsly
from click import NoSuchOption from click import NoSuchOption
from packaging.specifiers import SpecifierSet from packaging.specifiers import SpecifierSet
from thinc.api import Config, ConfigValidationError from thinc.api import Config, ConfigValidationError
from spacy.tokens import DocBin
from spacy import about from spacy import about
from spacy.cli import info from spacy.cli import info
@ -27,6 +28,7 @@ from spacy.cli.debug_data import _get_span_characteristics
from spacy.cli.debug_data import _print_span_characteristics from spacy.cli.debug_data import _print_span_characteristics
from spacy.cli.debug_data import _get_spans_length_freq_dist from spacy.cli.debug_data import _get_spans_length_freq_dist
from spacy.cli.download import get_compatibility, get_version from spacy.cli.download import get_compatibility, get_version
from spacy.cli.evaluate import render_parses
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.init_pipeline import _init_labels from spacy.cli.init_pipeline import _init_labels
from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import get_third_party_dependencies
@ -144,6 +146,70 @@ def test_issue11235():
assert cfg["commands"][0]["script"][0] == f"hello {lang_var}" assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
@pytest.mark.issue(12566)
@pytest.mark.parametrize(
"factory,output_file",
[("deps", "parses.html"), ("ents", "entities.html"), ("spans", "spans.html")],
)
def test_issue12566(factory: str, output_file: str):
"""
Test if all displaCy types (ents, dep, spans) produce an HTML file
"""
with make_tempdir() as tmp_dir:
# Create sample spaCy file
doc_json = {
"ents": [
{"end": 54, "label": "nam_adj_country", "start": 44},
{"end": 83, "label": "nam_liv_person", "start": 69},
{"end": 100, "label": "nam_pro_title_book", "start": 86},
],
"spans": {
"sc": [
{"end": 54, "kb_id": "", "label": "nam_adj_country", "start": 44},
{"end": 83, "kb_id": "", "label": "nam_liv_person", "start": 69},
{
"end": 100,
"kb_id": "",
"label": "nam_pro_title_book",
"start": 86,
},
]
},
"text": "Niedawno czytał em nową książkę znakomitego szkockiego medioznawcy , "
"Briana McNaira - Cultural Chaos .",
"tokens": [
# fmt: off
{"id": 0, "start": 0, "end": 8, "tag": "ADV", "pos": "ADV", "morph": "Degree=Pos", "lemma": "niedawno", "dep": "advmod", "head": 1, },
{"id": 1, "start": 9, "end": 15, "tag": "PRAET", "pos": "VERB", "morph": "Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", "lemma": "czytać", "dep": "ROOT", "head": 1, },
{"id": 2, "start": 16, "end": 18, "tag": "AGLT", "pos": "NOUN", "morph": "Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing", "lemma": "em", "dep": "iobj", "head": 1, },
{"id": 3, "start": 19, "end": 23, "tag": "ADJ", "pos": "ADJ", "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing", "lemma": "nowy", "dep": "amod", "head": 4, },
{"id": 4, "start": 24, "end": 31, "tag": "SUBST", "pos": "NOUN", "morph": "Case=Acc|Gender=Fem|Number=Sing", "lemma": "książka", "dep": "obj", "head": 1, },
{"id": 5, "start": 32, "end": 43, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Nhum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "znakomit", "dep": "acl", "head": 4, },
{"id": 6, "start": 44, "end": 54, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Hum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "szkockiy", "dep": "amod", "head": 7, },
{"id": 7, "start": 55, "end": 66, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "medioznawca", "dep": "iobj", "head": 5, },
{"id": 8, "start": 67, "end": 68, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Comm", "lemma": ",", "dep": "punct", "head": 9, },
{"id": 9, "start": 69, "end": 75, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "Brian", "dep": "nmod", "head": 4, },
{"id": 10, "start": 76, "end": 83, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "McNair", "dep": "flat", "head": 9, },
{"id": 11, "start": 84, "end": 85, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Dash", "lemma": "-", "dep": "punct", "head": 12, },
{"id": 12, "start": 86, "end": 94, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Cultural", "dep": "conj", "head": 4, },
{"id": 13, "start": 95, "end": 100, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Chaos", "dep": "flat", "head": 12, },
{"id": 14, "start": 101, "end": 102, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Peri", "lemma": ".", "dep": "punct", "head": 1, },
# fmt: on
],
}
# Create a .spacy file
nlp = spacy.blank("pl")
doc = Doc(nlp.vocab).from_json(doc_json)
# Run the evaluate command and check if the html files exist
render_parses(
docs=[doc], output_path=tmp_dir, model_name="", limit=1, **{factory: True}
)
assert (tmp_dir / output_file).is_file()
def test_cli_info(): def test_cli_info():
nlp = Dutch() nlp = Dutch()
nlp.add_pipe("textcat") nlp.add_pipe("textcat")


@ -115,6 +115,14 @@ def test_tokenization(sented_doc):
assert scores["token_r"] == approx(0.33333333) assert scores["token_r"] == approx(0.33333333)
assert scores["token_f"] == 0.4 assert scores["token_f"] == 0.4
# per-component scoring
scorer = Scorer()
scores = scorer.score([example], per_component=True)
assert scores["tokenizer"]["token_acc"] == 0.5
assert scores["tokenizer"]["token_p"] == 0.5
assert scores["tokenizer"]["token_r"] == approx(0.33333333)
assert scores["tokenizer"]["token_f"] == 0.4
def test_sents(sented_doc): def test_sents(sented_doc):
scorer = Scorer() scorer = Scorer()
@ -278,6 +286,13 @@ def test_tag_score(tagged_doc):
assert results["morph_per_feat"]["Poss"]["f"] == 0.0 assert results["morph_per_feat"]["Poss"]["f"] == 0.0
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
# per-component scoring
scorer = Scorer()
results = scorer.score([example], per_component=True)
assert results["tagger"]["tag_acc"] == 0.9
assert results["morphologizer"]["pos_acc"] == 0.9
assert results["morphologizer"]["morph_acc"] == approx(0.8)
def test_partial_annotation(en_tokenizer): def test_partial_annotation(en_tokenizer):
pred_doc = en_tokenizer("a b c d e") pred_doc = en_tokenizer("a b c d e")
@ -423,14 +438,14 @@ def test_score_spans():
return doc.spans[span_key] return doc.spans[span_key]
# Predict exactly the same, but overlapping spans will be discarded # Predict exactly the same, but overlapping spans will be discarded
pred.spans[key] = spans pred.spans[key] = gold.spans[key].copy(doc=pred)
eg = Example(pred, gold) eg = Example(pred, gold)
scores = Scorer.score_spans([eg], attr=key, getter=span_getter) scores = Scorer.score_spans([eg], attr=key, getter=span_getter)
assert scores[f"{key}_p"] == 1.0 assert scores[f"{key}_p"] == 1.0
assert scores[f"{key}_r"] < 1.0 assert scores[f"{key}_r"] < 1.0
# Allow overlapping, now both precision and recall should be 100% # Allow overlapping, now both precision and recall should be 100%
pred.spans[key] = spans pred.spans[key] = gold.spans[key].copy(doc=pred)
eg = Example(pred, gold) eg = Example(pred, gold)
scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True) scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True)
assert scores[f"{key}_p"] == 1.0 assert scores[f"{key}_p"] == 1.0


@ -1264,12 +1264,14 @@ cdef class Doc:
other.user_span_hooks = dict(self.user_span_hooks) other.user_span_hooks = dict(self.user_span_hooks)
other.length = self.length other.length = self.length
other.max_length = self.max_length other.max_length = self.max_length
other.spans = self.spans.copy(doc=other)
buff_size = other.max_length + (PADDING*2) buff_size = other.max_length + (PADDING*2)
assert buff_size > 0 assert buff_size > 0
tokens = <TokenC*>other.mem.alloc(buff_size, sizeof(TokenC)) tokens = <TokenC*>other.mem.alloc(buff_size, sizeof(TokenC))
memcpy(tokens, self.c - PADDING, buff_size * sizeof(TokenC)) memcpy(tokens, self.c - PADDING, buff_size * sizeof(TokenC))
other.c = &tokens[PADDING] other.c = &tokens[PADDING]
# copy spans after setting tokens so that SpanGroup.copy can verify
# that the start/end offsets are valid
other.spans = self.spans.copy(doc=other)
return other return other
def to_disk(self, path, *, exclude=tuple()): def to_disk(self, path, *, exclude=tuple()):
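Sketch of what the reordering above preserves: doc.spans survives Doc.copy(), and the copy now happens after the token buffer exists so SpanGroup.copy can validate the character offsets. nlp is assumed, e.g. spacy.blank("en"):

doc = nlp("a b c")
doc.spans["sc"] = [doc[0:2]]
copied = doc.copy()
assert copied.spans["sc"][0].text == "a b"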


@ -1,10 +1,12 @@
from typing import Callable, Protocol, Iterator, Optional, Union, Tuple, Any, overload from typing import Any, Callable, Iterator, Optional, Protocol, Tuple, Union, overload
from thinc.types import Floats1d, Ints2d, FloatsXd
from thinc.types import Floats1d, FloatsXd, Ints2d
from ..lexeme import Lexeme
from ..vocab import Vocab
from .doc import Doc from .doc import Doc
from .token import Token from .token import Token
from .underscore import Underscore from .underscore import Underscore
from ..lexeme import Lexeme
from ..vocab import Vocab
class SpanMethod(Protocol): class SpanMethod(Protocol):
def __call__(self: Span, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc] def __call__(self: Span, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
@ -51,7 +53,12 @@ class Span:
kb_id: Union[str, int] = ..., kb_id: Union[str, int] = ...,
span_id: Union[str, int] = ..., span_id: Union[str, int] = ...,
) -> None: ... ) -> None: ...
def __richcmp__(self, other: Span, op: int) -> bool: ... def __lt__(self, other: Any) -> bool: ...
def __le__(self, other: Any) -> bool: ...
def __eq__(self, other: Any) -> bool: ...
def __ne__(self, other: Any) -> bool: ...
def __gt__(self, other: Any) -> bool: ...
def __ge__(self, other: Any) -> bool: ...
def __hash__(self) -> int: ... def __hash__(self) -> int: ...
def __len__(self) -> int: ... def __len__(self) -> int: ...
def __repr__(self) -> str: ... def __repr__(self) -> str: ...


@ -1,4 +1,5 @@
from typing import Any, Dict, Iterable, Optional from typing import Any, Dict, Iterable, Iterator, Optional
from .doc import Doc from .doc import Doc
from .span import Span from .span import Span
@ -18,7 +19,7 @@ class SpanGroup:
def doc(self) -> Doc: ... def doc(self) -> Doc: ...
@property @property
def has_overlap(self) -> bool: ... def has_overlap(self) -> bool: ...
def __iter__(self): ... def __iter__(self) -> Iterator[Span]: ...
def __len__(self) -> int: ... def __len__(self) -> int: ...
def append(self, span: Span) -> None: ... def append(self, span: Span) -> None: ...
def extend(self, spans: Iterable[Span]) -> None: ... def extend(self, spans: Iterable[Span]) -> None: ...


@ -52,6 +52,8 @@ cdef class SpanGroup:
if len(spans) : if len(spans) :
self.c.reserve(len(spans)) self.c.reserve(len(spans))
for span in spans: for span in spans:
if doc is not span.doc:
raise ValueError(Errors.E855.format(obj="span"))
self.push_back(span.c) self.push_back(span.c)
def __repr__(self): def __repr__(self):
@ -261,11 +263,22 @@ cdef class SpanGroup:
""" """
if doc is None: if doc is None:
doc = self.doc doc = self.doc
if doc is self.doc:
spans = list(self)
else:
spans = [doc.char_span(span.start_char, span.end_char, label=span.label_, kb_id=span.kb_id, span_id=span.id) for span in self]
for i, span in enumerate(spans):
if span is None:
raise ValueError(Errors.E1052.format(i=i))
if span.kb_id in self.doc.vocab.strings:
doc.vocab.strings.add(span.kb_id_)
if span.id in span.doc.vocab.strings:
doc.vocab.strings.add(span.id_)
return SpanGroup( return SpanGroup(
doc, doc,
name=self.name, name=self.name,
attrs=deepcopy(self.attrs), attrs=deepcopy(self.attrs),
spans=list(self), spans=spans,
) )
def _concat( def _concat(
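Taken together, these hunks tighten `SpanGroup`: the constructor rejects spans that belong to a different `Doc` (E855), and `copy(doc=...)` rebuilds each span on the target `Doc` via `char_span`, raising E1052 if an offset does not map cleanly. A minimal sketch, assuming two docs with identical text:

```python
import spacy

nlp = spacy.blank("en")
doc_a = nlp("Berlin is a city")
doc_b = nlp("Berlin is a city")  # same text, so character offsets line up

doc_a.spans["cities"] = [doc_a.char_span(0, 6, label="GPE")]

# Copy the group onto another Doc; each span is re-created with char_span on doc_b,
# so a mismatched offset raises ValueError (E1052) instead of silently corrupting data.
doc_b.spans["cities"] = doc_a.spans["cities"].copy(doc=doc_b)
assert doc_b.spans["cities"][0].doc is doc_b
```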

View File

@ -133,10 +133,11 @@ def init_vocab(
logger.info("Added vectors: %s", vectors) logger.info("Added vectors: %s", vectors)
# warn if source model vectors are not identical # warn if source model vectors are not identical
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) if len(sourced_vectors_hashes) > 0:
for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items(): vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
if vectors_hash != sourced_vectors_hash: for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
warnings.warn(Warnings.W113.format(name=sourced_component)) if vectors_hash != sourced_vectors_hash:
warnings.warn(Warnings.W113.format(name=sourced_component))
logger.info("Finished initializing nlp object") logger.info("Finished initializing nlp object")

View File

@ -1,11 +1,13 @@
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
from .compat import Protocol, runtime_checkable from .compat import Protocol, runtime_checkable
from thinc.api import Optimizer, Model from thinc.api import Optimizer, Model
if TYPE_CHECKING: if TYPE_CHECKING:
from .training import Example from .training import Example
from .language import Language
@runtime_checkable @runtime_checkable
@ -32,7 +34,7 @@ class InitializableComponent(Protocol):
def initialize( def initialize(
self, self,
get_examples: Callable[[], Iterable["Example"]], get_examples: Callable[[], Iterable["Example"]],
nlp: Iterable["Example"], nlp: "Language",
**kwargs: Any **kwargs: Any
): ):
... ...
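The corrected annotation matches how initializable components are actually written: `initialize` receives the `nlp` object, not a second stream of examples. A hedged sketch of a conforming component (the factory name `toy_component` is made up for illustration):

```python
from typing import Any, Callable, Iterable

from spacy.language import Language
from spacy.tokens import Doc
from spacy.training import Example


@Language.factory("toy_component")
def create_toy_component(nlp: Language, name: str) -> "ToyComponent":
    return ToyComponent()


class ToyComponent:
    def __call__(self, doc: Doc) -> Doc:
        return doc

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        nlp: Language,  # typed as Language, matching the fixed Protocol
        **kwargs: Any,
    ) -> None:
        # e.g. inspect the training data before the first update
        for example in get_examples():
            pass
```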

View File

@ -1163,18 +1163,19 @@ skew. To render a sample of dependency parses in a HTML file using the
$ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
``` ```
| Name | Description | | Name | Description |
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | | `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | | `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | | `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | | `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | | `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | | `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | | `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ |
| **CREATES** | Training results and optional metrics and visualizations. | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Training results and optional metrics and visualizations. |
### speed {id="benchmark-speed", version="3.5", tag="command"} ### speed {id="benchmark-speed", version="3.5", tag="command"}
@ -1220,7 +1221,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ | | `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ | | `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ | | `output-file` | Output `DocBin` path. ~~str (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ | | `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ | | `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
@ -1640,7 +1641,7 @@ with [`spacy package`](/api/cli#package) and `--build wheel`. For more details,
see the spaCy project [integration](/usage/projects#huggingface_hub). see the spaCy project [integration](/usage/projects#huggingface_hub).
```bash ```bash
$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] [--verbose] $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose]
``` ```
> #### Example > #### Example
@ -1654,6 +1655,5 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo]
| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ | | `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ |
| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ | | `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ |
| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ | | `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ |
| `--local-repo`, `-l` | Local path to the model repository (will be created if it doesn't exist). Defaults to `hub` in the current working directory. ~~Path (option)~~ |
| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ | | `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ |
| **UPLOADS** | The pipeline to the hub. | | **UPLOADS** | The pipeline to the hub. |

View File

@ -64,7 +64,7 @@ architectures and their arguments and hyperparameters.
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | | `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ | | `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ | | `generate_empty_kb` <Tag variant="new">3.5.1</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | | `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |

View File

@ -292,7 +292,7 @@ Restore the state of the knowledge base from a given directory. Note that the
> ```python > ```python
> from spacy.vocab import Vocab > from spacy.vocab import Vocab
> vocab = Vocab().from_disk("/path/to/vocab") > vocab = Vocab().from_disk("/path/to/vocab")
> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64) > kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
> kb.from_disk("/path/to/kb") > kb.from_disk("/path/to/kb")
> ``` > ```

View File

@ -382,15 +382,16 @@ objects instead of tuples of `Doc` and `GoldParse` objects.
> print(scores) > print(scores)
> ``` > ```
| Name | Description | | Name | Description |
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | | -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `batch_size` | The batch size to use. ~~Optional[int]~~ | | `batch_size` | The batch size to use. ~~Optional[int]~~ |
| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ | | `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ | | `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |
| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | | `per_component` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ |
| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
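A short sketch of the new `per_component` flag documented above, assuming spaCy >= 3.6 and any trained pipeline (here `en_core_web_sm`, which must be installed):

```python
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")
examples = [
    Example.from_dict(
        nlp.make_doc("Apple is looking at buying a U.K. startup"),
        {"entities": [(0, 5, "ORG")]},
    )
]

flat_scores = nlp.evaluate(examples)                        # flat keys, e.g. "ents_f"
per_component = nlp.evaluate(examples, per_component=True)  # keyed by component name
```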
## Language.use_params {id="use_params",tag="contextmanager, method"} ## Language.use_params {id="use_params",tag="contextmanager, method"}

View File

@ -213,11 +213,11 @@ Retrieve values for a feature by field.
> assert morph.get("Feat1") == ["Val1", "Val2"] > assert morph.get("Feat1") == ["Val1", "Val2"]
> ``` > ```
| Name | Description | | Name | Description |
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ | | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `field` | The field to retrieve. ~~str~~ | | `field` | The field to retrieve. ~~str~~ |
| `default` <Tag variant="new">3.6</Tag> | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ | | `default` <Tag variant="new">3.5.3</Tag> | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ |
| **RETURNS** | A list of the individual features. ~~List[str]~~ | | **RETURNS** | A list of the individual features. ~~List[str]~~ |
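A quick sketch of the `default` argument, assuming spaCy >= 3.5.3 and a pipeline without a morphologizer (so the field is unset):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("girls")
morph = doc[0].morph  # empty analysis in a blank pipeline

assert morph.get("Number") == []                          # unset field: default return value
assert morph.get("Number", default=["Sing"]) == ["Sing"]  # explicit fallback
```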
### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"} ### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"}

View File

@ -33,7 +33,7 @@ Create a new `Scorer`.
| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ | | `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ |
| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ | | `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ | | `**kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |
## Scorer.score {id="score",tag="method"} ## Scorer.score {id="score",tag="method"}
@ -67,10 +67,12 @@ core pipeline components, the individual score names start with the `Token` or
> scores = scorer.score(examples) > scores = scorer.score(examples)
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------- | | -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | | _keyword-only_ | |
| `per_component` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ |
| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## Scorer.score_tokenization {id="score_tokenization",tag="staticmethod",version="3"} ## Scorer.score_tokenization {id="score_tokenization",tag="staticmethod",version="3"}

View File

@ -469,7 +469,7 @@ factories.
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
| `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). | | `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). |
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
| `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `\*\*kwargs` and return scores as `Dict[str, Any]`. | | `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `**kwargs` and return scores as `Dict[str, Any]`. |
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. | | `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
### spacy-transformers registry {id="registry-transformers"} ### spacy-transformers registry {id="registry-transformers"}

View File

@ -259,6 +259,26 @@ source code and recompiling frequently.
$ python setup.py develop $ python setup.py develop
``` ```
#### Visual Studio Code extension
![spaCy extension demo](/images/spacy-extension-demo.gif)
The [spaCy VSCode Extension](https://github.com/explosion/spacy-vscode) provides
additional tooling and features for working with spaCy's config files. Version
1.0.0 includes hover descriptions for registry functions, variables, and section
names within the config as an installable extension.
1. Install a supported version of Python on your system (`>=3.7`)
2. Install the
[Python Extension for Visual Studio Code](https://code.visualstudio.com/docs/python/python-tutorial)
3. Create a
[virtual python environment](https://docs.python.org/3/library/venv.html)
4. Install all python requirements (`spaCy >= 3.4.0` & `pygls >= 1.0.0`)
5. Install
[spaCy extension for Visual Studio Code](https://marketplace.visualstudio.com/items?itemName=Explosion.spacy-extension)
6. Select your python environment
7. You are ready to work with `.cfg` files in spaCy!
### Building an executable {id="executable"} ### Building an executable {id="executable"}
The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that

View File

@ -56,14 +56,19 @@ wrap. So if you come across this problem, especially when using custom labels,
you'll have to increase the `distance` setting in the `options` to allow longer you'll have to increase the `distance` setting in the `options` to allow longer
arcs. arcs.
Moreover, you might need to modify the `offset_x` argument depending on the shape
of your document. Otherwise, the left part of the document may overflow beyond the
container's border.
</Infobox> </Infobox>
| Argument | Description | | Argument | Description |
| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ | | `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | | `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | | `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
| `offset_x` | Spacing on left side of the SVG in px. You might need to tweak this setting for long texts. Defaults to `50`. ~~int~~ |
For a list of all available options, see the For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options). [`displacy` API documentation](/api/top-level#displacy_options).
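A hedged example combining the settings mentioned above (`distance` for longer arcs, `offset_x` to keep the left edge inside the container), assuming `en_core_web_sm` is installed:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # requires: python -m spacy download en_core_web_sm
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

options = {"compact": True, "distance": 140, "offset_x": 120, "font": "Arial"}
html = displacy.render(doc, style="dep", options=options, page=True)
```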

View File

@ -264,6 +264,11 @@
"code": "mr", "code": "mr",
"name": "Marathi" "name": "Marathi"
}, },
{
"code": "ms",
"name": "Malay",
"has_examples": true
},
{ {
"code": "nb", "code": "nb",
"name": "Norwegian Bokmål", "name": "Norwegian Bokmål",

View File

@ -1,5 +1,72 @@
{ {
"resources": [ "resources": [
{
"id": "spacy-vscode",
"title": "spaCy Visual Studio Code Extension",
"thumb": "https://raw.githubusercontent.com/explosion/spacy-vscode/main/icon.png",
"slogan": "Work with spaCy's config files in VS Code",
"description": "The spaCy VS Code Extension provides additional tooling and features for working with spaCy's config files. Version 1.0.0 includes hover descriptions for registry functions, variables, and section names within the config as an installable extension.",
"url": "https://marketplace.visualstudio.com/items?itemName=Explosion.spacy-extension",
"github": "explosion/spacy-vscode",
"code_language": "python",
"author": "Explosion",
"author_links": {
"twitter": "@explosion_ai",
"github": "explosion"
},
"category": ["extension"],
"tags": []
},
{
"id": "parsigs",
"title": "parsigs",
"slogan": "Structuring prescriptions text made simple using spaCy",
"description": "Parsigs is an open-source project that aims to extract the relevant dosage information from prescriptions text without compromising the patient's privacy.\n\nNotice you also need to install the model in order to use the package: `pip install https://huggingface.co/royashcenazi/en_parsigs/resolve/main/en_parsigs-any-py3-none-any.whl`",
"github": "royashcenazi/parsigs",
"pip": "parsigs",
"code_language": "python",
"author": "Roy Ashcenazi",
"code_example": [
"# You'll need to install the trained model, see instructions in the description section",
"from parsigs.parse_sig_api import StructuredSig, SigParser",
"sig_parser = SigParser()",
"",
"sig = 'Take 1 tablet of ibuprofen 200mg 3 times every day for 3 weeks'",
"parsed_sig = sig_parser.parse(sig)"
],
"author_links": {
"github": "royashcenazi"
},
"category": ["model", "research", "biomedical"],
"tags": ["sigs", "prescription","pharma"]
},
{
"id": "latincy",
"title": "LatinCy",
"thumb": "https://raw.githubusercontent.com/diyclassics/la_core_web_lg/main/latincy-logo.png",
"slogan": "Synthetic trained spaCy pipelines for Latin NLP",
"description": "Set of trained general purpose Latin-language 'core' pipelines for use with spaCy. The models are trained on a large amount of available Latin data, including all five of the Latin Universal Dependency treebanks, which have been preprocessed to be compatible with each other.",
"url": "https://huggingface.co/latincy",
"code_example": [
"# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl",
"import spacy",
"nlp = spacy.load('la_core_web_lg')",
"doc = nlp('Haec narrantur a poetis de Perseo')",
"",
"print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')",
"",
"# > Haec, haec, hic, DET"
],
"code_language": "python",
"author": "Patrick J. Burns",
"author_links": {
"twitter": "@diyclassics",
"github": "diyclassics",
"website": "https://diyclassics.github.io/"
},
"category": ["pipeline", "research"],
"tags": ["latin"]
},
{ {
"id": "spacy-wasm", "id": "spacy-wasm",
"title": "spacy-wasm", "title": "spacy-wasm",
@ -334,7 +401,7 @@
}, },
{ {
"id": "spacypdfreader", "id": "spacypdfreader",
"title": "spadypdfreader", "title": "spacypdfreader",
"category": ["pipeline"], "category": ["pipeline"],
"tags": ["PDF"], "tags": ["PDF"],
"slogan": "Easy PDF to text to spaCy text extraction in Python.", "slogan": "Easy PDF to text to spaCy text extraction in Python.",
@ -351,7 +418,7 @@
}, },
"code_example": [ "code_example": [
"import spacy", "import spacy",
"from spacypdfreader import pdf_reader", "from spacypdfreader.spacypdfreader import pdf_reader",
"", "",
"nlp = spacy.load('en_core_web_sm')", "nlp = spacy.load('en_core_web_sm')",
"doc = pdf_reader('tests/data/test_pdf_01.pdf', nlp)", "doc = pdf_reader('tests/data/test_pdf_01.pdf', nlp)",
@ -2810,6 +2877,58 @@
"tags": ["coreference", "multi-lingual", "cross-lingual", "allennlp"], "tags": ["coreference", "multi-lingual", "cross-lingual", "allennlp"],
"spacy_version": 3 "spacy_version": 3
}, },
{
"id": "adeptaugmentations",
"title": "Adept Augmentations",
"slogan": " A Python library aimed at dissecting and augmenting NER training data for a few-shot scenario.",
"description": "EntitySwapAugmenter takes either a `datasets.Dataset` or a `spacy.tokens.DocBin`. Additionally, it is optional to provide a set of labels. It initially creates a knowledge base of entities belonging to a certain label. When running `augmenter.augment()` for N runs, it then creates N new sentences with random swaps of the original entities with an entity of the same corresponding label from the knowledge base.\n\nFor example, assuming that we have knowledge base for `PERSONS`, `LOCATIONS` and `PRODUCTS`. We can then create additional data for the sentence \"Momofuko Ando created instant noodles in Osaka.\" using `augmenter.augment(N=2)`, resulting in \"David created instant noodles in Madrid.\" or \"Tom created Adept Augmentations in the Netherlands\".",
"github": "argilla-io/adept-augmentations",
"pip": "adept-augmentations",
"thumb": "https://raw.githubusercontent.com/argilla-io/adept-augmentations/main/logo.png",
"code_example": [
"from adept_augmentations import EntitySwapAugmenter",
"import spacy",
"from spacy.tokens import Doc, DocBin",
"nlp = spacy.blank(\"en\")",
"",
"# Create some example golden data",
"example_data = [",
" (\"Apple is looking at buying U.K. startup for $1 billion\", [(0, 5, \"ORG\"), (27, 31, \"LOC\"), (44, 54, \"MONEY\")]),",
" (\"Microsoft acquires GitHub for $7.5 billion\", [(0, 9, \"ORG\"), (19, 25, \"ORG\"), (30, 42, \"MONEY\")]),",
"]",
"",
"# Create a new DocBin",
"nlp = spacy.blank(\"en\")",
"docs = []",
"for entry in example_data:",
" doc = Doc(nlp.vocab, words=entry[0].split())",
" doc.ents = [doc.char_span(ent[0], ent[1], label=ent[2]) for ent in entry[1]]",
" docs.append(doc)",
"golden_dataset = DocBin(docs=docs)",
"",
"# Augment Data",
"augmented_dataset = EntitySwapAugmenter(golden_dataset).augment(4)",
"for doc in augmented_dataset.get_docs(nlp.vocab):",
" print(doc.text)",
"",
"# GitHub is looking at buying U.K. startup for $ 7.5 billion",
"# Microsoft is looking at buying U.K. startup for $ 1 billion",
"# Microsoft is looking at buying U.K. startup for $ 7.5 billion",
"# GitHub is looking at buying U.K. startup for $ 1 billion",
"# Microsoft acquires Apple for $ 7.5 billion",
"# Apple acquires Microsoft for $ 1 billion",
"# Microsoft acquires Microsoft for $ 7.5 billion",
"# GitHub acquires GitHub for $ 1 billion"
],
"author": "David Berenstein",
"author_links": {
"github": "davidberenstein1957",
"website": "https://www.linkedin.com/in/david-berenstein-1bab11105/"
},
"category": ["standalone"],
"tags": ["ner", "few-shot", "augmentation", "datasets", "training"],
"spacy_version": 3
},
{ {
"id": "blackstone", "id": "blackstone",
"title": "Blackstone", "title": "Blackstone",
@ -4162,6 +4281,37 @@
}, },
"category": ["pipeline", "research"], "category": ["pipeline", "research"],
"tags": ["Thai"] "tags": ["Thai"]
},
{
"id": "vetiver",
"title": "Vetiver",
"slogan": "Version, share, deploy, and monitor models.",
"description": "The goal of vetiver is to provide fluent tooling to version, deploy, and monitor a trained model. Functions handle creating model objects, versioning models, predicting from a remote API endpoint, deploying Dockerfiles, and more.",
"github": "rstudio/vetiver-python",
"pip": "vetiver",
"code_example": [
"import spacy",
"from vetiver import VetiverModel, VetiverAPI",
"",
"# If you use this model, you'll need to download it first:",
"# python -m spacy download en_core_web_md",
"nlp = spacy.load('en_core_web_md')",
"# Create deployable model object with your nlp Language object",
"v = VetiverModel(nlp, model_name = 'my_model')",
"# Try out your API endpoint locally",
"VetiverAPI(v).run()"
],
"code_language": "python",
"url": "https://vetiver.rstudio.com/",
"thumb": "https://raw.githubusercontent.com/rstudio/vetiver-python/main/docs/figures/square-logo.svg",
"author": "Posit, PBC",
"author_links": {
"twitter": "posit_pbc",
"github": "rstudio",
"website": "https://posit.co/"
},
"category": ["apis", "standalone"],
"tags": ["apis", "deployment"]
} }
], ],

Binary file not shown (new image, 9.4 MiB)