diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 21b660989..619570090 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -107,22 +107,22 @@ jobs:
- name: Test import
run: python -W error -c "import spacy"
- - name: "Test download CLI"
- run: |
- python -m spacy download ca_core_news_sm
- python -m spacy download ca_core_news_md
- python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
- if: matrix.python_version == '3.9'
-
- - name: "Test download_url in info CLI"
- run: |
- python -W error -m spacy info ca_core_news_sm | grep -q download_url
- if: matrix.python_version == '3.9'
-
- - name: "Test no warnings on load (#11713)"
- run: |
- python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
- if: matrix.python_version == '3.9'
+# - name: "Test download CLI"
+# run: |
+# python -m spacy download ca_core_news_sm
+# python -m spacy download ca_core_news_md
+# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+# if: matrix.python_version == '3.9'
+#
+# - name: "Test download_url in info CLI"
+# run: |
+# python -W error -m spacy info ca_core_news_sm | grep -q download_url
+# if: matrix.python_version == '3.9'
+#
+# - name: "Test no warnings on load (#11713)"
+# run: |
+# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+# if: matrix.python_version == '3.9'
- name: "Test convert CLI"
run: |
@@ -146,17 +146,17 @@ jobs:
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
if: matrix.python_version == '3.9'
- - name: "Test assemble CLI"
- run: |
- python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
- PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
- if: matrix.python_version == '3.9'
-
- - name: "Test assemble CLI vectors warning"
- run: |
- python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
- python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
- if: matrix.python_version == '3.9'
+# - name: "Test assemble CLI"
+# run: |
+# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+# if: matrix.python_version == '3.9'
+#
+# - name: "Test assemble CLI vectors warning"
+# run: |
+# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+# if: matrix.python_version == '3.9'
- name: "Install test requirements"
run: |
diff --git a/README.md b/README.md
index 36a015caf..59d3ee9ee 100644
--- a/README.md
+++ b/README.md
@@ -35,19 +35,20 @@ open-source software, released under the [MIT license](https://github.com/explos
## 📖 Documentation
-| Documentation | |
-| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
-| 📚 **[Usage Guides]** | How to use spaCy and its features. |
-| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
-| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
-| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
-| 📦 **[Models]** | Download trained pipelines for spaCy. |
-| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
-| 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
-| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
-| 🛠 **[Changelog]** | Changes and version history. |
-| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
+| Documentation | |
+| ----------------------------- | ---------------------------------------------------------------------- |
+| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
+| 📚 **[Usage Guides]** | How to use spaCy and its features. |
+| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
+| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
+| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
+| 📦 **[Models]** | Download trained pipelines for spaCy. |
+| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
+| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
+| 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
+| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
+| 🛠 **[Changelog]** | Changes and version history. |
+| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
| | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
@@ -57,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
[api reference]: https://spacy.io/api/
[models]: https://spacy.io/models
[universe]: https://spacy.io/universe
+[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
[videos]: https://www.youtube.com/c/ExplosionAI
[online course]: https://course.spacy.io
[project templates]: https://github.com/explosion/projects
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
-
## 💬 Where to ask questions
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
diff --git a/requirements.txt b/requirements.txt
index 63e03d558..b979929c5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.8.0
+typer>=0.3.0,<0.10.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
# Third party dependencies
diff --git a/setup.cfg b/setup.cfg
index eea557337..45734888f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -52,7 +52,7 @@ install_requires =
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
# Third-party dependencies
- typer>=0.3.0,<0.8.0
+ typer>=0.3.0,<0.10.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
diff --git a/spacy/about.py b/spacy/about.py
index 640e9e93b..7c0a59b4e 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.5.0"
+__version__ = "3.6.0.dev1"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0c9a32b93..df4bca53d 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -81,11 +81,8 @@ def download(
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
dl_tpl = "{m}-{v}/{m}-{v}{s}"
- egg_tpl = "#egg={m}=={v}"
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
- if sdist:
- filename += egg_tpl.format(m=model_name, v=version)
return filename
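
Note: with the `#egg=` fragment dropped, sdist downloads resolve to a plain archive path. A quick sketch of the expected output, assuming spaCy's usual suffix constants (`.tar.gz` for sdists) and a hypothetical model name and version:

    from spacy.cli.download import get_model_filename

    print(get_model_filename("en_core_web_sm", "3.6.0", sdist=True))
    # en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz  (no trailing "#egg=..." fragment)
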
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 363c02cd3..9fcdd18be 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -27,6 +27,7 @@ def evaluate_cli(
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
+ per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
# fmt: on
):
"""
@@ -50,6 +51,7 @@ def evaluate_cli(
gold_preproc=gold_preproc,
displacy_path=displacy_path,
displacy_limit=displacy_limit,
+ per_component=per_component,
silent=False,
)
@@ -64,6 +66,7 @@ def evaluate(
displacy_limit: int = 25,
silent: bool = True,
spans_key: str = "sc",
+ per_component: bool = False,
) -> Dict[str, Any]:
msg = Printer(no_print=silent, pretty=not silent)
fix_random_seed()
@@ -78,44 +81,53 @@ def evaluate(
corpus = Corpus(data_path, gold_preproc=gold_preproc)
nlp = util.load_model(model)
dev_dataset = list(corpus(nlp))
- scores = nlp.evaluate(dev_dataset)
- metrics = {
- "TOK": "token_acc",
- "TAG": "tag_acc",
- "POS": "pos_acc",
- "MORPH": "morph_acc",
- "LEMMA": "lemma_acc",
- "UAS": "dep_uas",
- "LAS": "dep_las",
- "NER P": "ents_p",
- "NER R": "ents_r",
- "NER F": "ents_f",
- "TEXTCAT": "cats_score",
- "SENT P": "sents_p",
- "SENT R": "sents_r",
- "SENT F": "sents_f",
- "SPAN P": f"spans_{spans_key}_p",
- "SPAN R": f"spans_{spans_key}_r",
- "SPAN F": f"spans_{spans_key}_f",
- "SPEED": "speed",
- }
- results = {}
- data = {}
- for metric, key in metrics.items():
- if key in scores:
- if key == "cats_score":
- metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
- if isinstance(scores[key], (int, float)):
- if key == "speed":
- results[metric] = f"{scores[key]:.0f}"
+ scores = nlp.evaluate(dev_dataset, per_component=per_component)
+ if per_component:
+ data = scores
+ if output is None:
+ msg.warn(
+ "The per-component option is enabled but there is no output JSON file provided to save the scores to."
+ )
+ else:
+ msg.info("Per-component scores will be saved to output JSON file.")
+ else:
+ metrics = {
+ "TOK": "token_acc",
+ "TAG": "tag_acc",
+ "POS": "pos_acc",
+ "MORPH": "morph_acc",
+ "LEMMA": "lemma_acc",
+ "UAS": "dep_uas",
+ "LAS": "dep_las",
+ "NER P": "ents_p",
+ "NER R": "ents_r",
+ "NER F": "ents_f",
+ "TEXTCAT": "cats_score",
+ "SENT P": "sents_p",
+ "SENT R": "sents_r",
+ "SENT F": "sents_f",
+ "SPAN P": f"spans_{spans_key}_p",
+ "SPAN R": f"spans_{spans_key}_r",
+ "SPAN F": f"spans_{spans_key}_f",
+ "SPEED": "speed",
+ }
+ results = {}
+ data = {}
+ for metric, key in metrics.items():
+ if key in scores:
+ if key == "cats_score":
+ metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
+ if isinstance(scores[key], (int, float)):
+ if key == "speed":
+ results[metric] = f"{scores[key]:.0f}"
+ else:
+ results[metric] = f"{scores[key]*100:.2f}"
else:
- results[metric] = f"{scores[key]*100:.2f}"
- else:
- results[metric] = "-"
- data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
+ results[metric] = "-"
+ data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
- msg.table(results, title="Results")
- data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
+ msg.table(results, title="Results")
+ data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
if displacy_path:
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
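
Note: the new option is exposed as `--per-component`/`-P` on the command line. A hedged Python sketch with hypothetical paths; when enabled, the raw per-component scores are written to the output JSON instead of the aggregated metrics table (and, as the warning above notes, the option is only useful together with an output path):

    from spacy.cli.evaluate import evaluate

    scores = evaluate(
        "./trained_pipeline",    # hypothetical pipeline path
        "./dev.spacy",           # hypothetical annotated DocBin
        output="./scores.json",  # per-component scores are saved here
        per_component=True,
    )
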
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 9481e53be..e3ca73cfb 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
[paths]
train = null
dev = null
@@ -28,7 +28,7 @@ lang = "{{ lang }}"
tok2vec/transformer. #}
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
{%- else -%}
{%- set full_pipeline = components -%}
@@ -127,6 +127,30 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
{% endif -%}
+{% if "span_finder" in components -%}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.span_finder.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
{% if "spancat" in components -%}
[components.spancat]
factory = "spancat"
@@ -392,6 +416,27 @@ nO = null
width = ${components.tok2vec.model.encode.width}
{% endif %}
+{% if "span_finder" in components %}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
{% if "spancat" in components %}
[components.spancat]
factory = "spancat"
diff --git a/spacy/errors.py b/spacy/errors.py
index f9bee07c9..cdc0ea380 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -975,7 +975,14 @@ class Errors(metaclass=ErrorsWithCodes):
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_select_port=True` to pick an available port automatically.")
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
- E1052 = ("Cannot create Language instance from config: missing pipeline components. The following components were added by instance (rather than config) via the 'Language.add_pipe_instance()' method, but are not present in the 'pipe_instances' variable: {names}")
+ E1052 = ("Unable to copy spans: the character offsets for the span at "
+ "index {i} in the span group do not align with the tokenization "
+ "in the target doc.")
+ E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
+ " 'min_length': {min_length}, 'max_length': {max_length}")
+ E1054 = ("The text, including whitespace, must match between reference and "
+ "predicted docs when training {component}.")
+ E1055 = ("Cannot create Language instance from config: missing pipeline components. The following components were added by instance (rather than config) via the 'Language.add_pipe_instance()' method, but are not present in the 'pipe_instances' variable: {names}")
# Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/lang/ms/__init__.py b/spacy/lang/ms/__init__.py
new file mode 100644
index 000000000..31a58a7e6
--- /dev/null
+++ b/spacy/lang/ms/__init__.py
@@ -0,0 +1,24 @@
+from .stop_words import STOP_WORDS
+from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
+from ...language import Language, BaseDefaults
+
+
+class MalayDefaults(BaseDefaults):
+ tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+ prefixes = TOKENIZER_PREFIXES
+ suffixes = TOKENIZER_SUFFIXES
+ infixes = TOKENIZER_INFIXES
+ syntax_iterators = SYNTAX_ITERATORS
+ lex_attr_getters = LEX_ATTRS
+ stop_words = STOP_WORDS
+
+
+class Malay(Language):
+ lang = "ms"
+ Defaults = MalayDefaults
+
+
+__all__ = ["Malay"]
diff --git a/spacy/lang/ms/_tokenizer_exceptions_list.py b/spacy/lang/ms/_tokenizer_exceptions_list.py
new file mode 100644
index 000000000..fba1dd70f
--- /dev/null
+++ b/spacy/lang/ms/_tokenizer_exceptions_list.py
@@ -0,0 +1,1943 @@
+# from https://prpm.dbp.gov.my/cari1?keyword=
+# dbp https://en.wikipedia.org/wiki/Dewan_Bahasa_dan_Pustaka
+MS_BASE_EXCEPTIONS = set(
+ """
+aba-aba
+abah-abah
+abar-abar
+abrit-abritan
+abu-abu
+abuk-abuk
+abun-abun
+acak-acak
+acak-acakan
+acang-acang
+aci-aci
+aci-acian
+aci-acinya
+adang-adang
+adap-adapan
+adik-beradik
+aduk-adukan
+agak-agak
+agar-agar
+agut-agut
+air-cooled
+ajar-ajar
+aji-aji
+akal-akal
+akhir-akhir
+aki-aki
+alah-mengalahi
+alan-alan
+alang-alang
+alang-alangan
+alap-alap
+ali-ali
+alih-alih
+aling-aling
+aling-alingan
+alip-alipan
+alon-alon
+alu-alu
+alu-aluan
+alun-alun
+alur-alur
+ambah-ambah
+ambai-ambai
+ambil-mengambil
+ambring-ambringan
+ambu-ambu
+ambung-ambung
+amin-amin
+ampai-ampai
+amung-amung
+anai-anai
+anak-anak
+anak-anakan
+anak-beranak
+ancak-ancak
+ancang-ancang
+andang-andang
+angan-angan
+anggar-anggar
+angin-angin
+angin-anginan
+angkul-angkul
+angkup-angkup
+angkut-angkut
+ani-ani
+aning-aning
+anjang-anjang
+anjing-anjing
+anjung-anjung
+anjung-anjungan
+antar-antar
+ante-mortem
+anting-anting
+antung-antung
+anyam-menganyam
+apa-apa
+api-api
+apit-apit
+aprit-apritan
+arah-arah
+arak-arakan
+aram-aram
+ari-ari
+aru-aru
+asa-asaan
+asam-asaman
+asuh-asuh
+atas-mengatasi
+ati-ati
+audio-visual
+avant-garde
+awang-awang
+awang-gemawang
+ayak-ayak
+ayam-ayam
+ayam-ayaman
+ayang-ayang
+ayeng-ayengan
+ayun-temayun
+back-up
+bahu-membahu
+baik-baik
+bajang-bajang
+baji-baji
+balai-balai
+balam-balam
+balas-membalas
+baling-baling
+balut-balut
+bangun-bangun
+bantal-bantal
+barat-barat
+barau-barau
+bari-bari
+barung-barung
+basa-basi
+bata-bata
+batir-batir
+bau-bauan
+bayang-bayang
+bedil-bedal
+begana-begini
+bekal-bekalan
+belat-belit
+belu-belai
+benggal-benggil
+bengkal-bengkil
+bengkang-bengkok
+bengkang-bengkong
+berabad-abad
+berabun-rabun
+berada-ada
+beragah-agah
+beragak-agak
+beragam-ragam
+beraja-raja
+berakit-rakit
+beraku-akuan
+beralun-alun
+beramah-ramahan
+beramah-tamah
+beramai-ramai
+berambai-ambai
+berambal-ambalan
+beramuk-amukan
+berandai-andai
+berandai-randai
+berang-berang
+berangan-angan
+beranggap-anggapan
+berangguk-angguk
+berangin-angin
+berangka-angka
+berangka-angkaan
+berangkai-rangkai
+beranja-anja
+berantai-rantai
+berapi-api
+berapung-apung
+berarak-arakan
+beras-beras
+berasing-asingan
+beratus-ratus
+berawas-awas
+berayal-ayalan
+berayun-ayun
+berbagai-bagai
+berbahas-bahasan
+berbalas-balasan
+berbalik-balik
+berbanjar-banjar
+berbantah-bantah
+berbanyak-banyak
+berbarik-barik
+berbasah-basah
+berbatu-batu
+berbayang-bayang
+berbecak-becak
+berbedil-bedilan
+berbeka-beka
+berbelakang-belakangan
+berbelang-belang
+berbeli-belian
+berbelit-belit
+berbelok-belok
+berbenar-benar
+berbencah-bencah
+berbesar-besar
+berbidai-bidai
+berbiku-biku
+berbilik-bilik
+berbinar-binar
+berbincang-bincang
+berbingkah-bingkah
+berbintang-bintang
+berbintik-bintik
+berbintil-bintil
+berbisik-bisik
+berbolak-balik
+berbolong-bolong
+berbondong-bondong
+berbongkah-bongkah
+berbuai-buai
+berbual-bual
+berbukit-bukit
+berbulan-bulan
+berbunga-bunga
+berbuntut-buntut
+berbunuh-bunuhan
+berburu-buru
+berburuk-buruk
+berbutir-butir
+bercabang-cabang
+bercaci-cacian
+bercakap-cakap
+bercakar-cakaran
+bercantik-cantik
+bercari-cari
+bercari-carian
+bercarik-carik
+bercepat-cepat
+bercerai-berai
+bercerai-cerai
+bercetai-cetai
+bercikun-cikun
+bercinta-cintaan
+bercita-cita
+berciut-ciut
+berconteng-conteng
+bercoreng-coreng
+bercoreng-moreng
+bercuit-cuit
+bercumbu-cumbu
+bercumbu-cumbuan
+bercura-bura
+bercura-cura
+berdada-dadaan
+berdahulu-dahuluan
+berdalam-dalam
+berdebar-debar
+berdecap-decap
+berdedai-dedai
+berdegap-degap
+berdegar-degar
+berdeham-deham
+berdekah-dekah
+berdekat-dekat
+berdelat-delat
+berdembun-dembun
+berdempang-dempang
+berdendam-dendaman
+berdengkang-dengkang
+berdentang-dentang
+berdentum-dentum
+berdentung-dentung
+berdepak-depak
+berdepan-depan
+berderai-derai
+berderak-derak
+berderau-derau
+berdering-dering
+berderung-derung
+berdesak-desakan
+berdesing-desing
+berdesus-desus
+berdikit-dikit
+berdingkit-dingkit
+berdua-dua
+berduri-duri
+berduru-duru
+berduyun-duyun
+berebut-rebut
+berebut-rebutan
+beregang-regang
+berek-berek
+berembut-rembut
+berempat-empat
+berenak-enak
+berenteng-renteng
+beresah-resah
+berfoya-foya
+bergagah-gagahan
+bergagap-gagap
+bergalur-galur
+berganda-ganda
+berganti-ganti
+bergarah-garah
+bergaruk-garuk
+bergegas-gegas
+bergelang-gelang
+bergelap-gelap
+bergelas-gelasan
+bergeleng-geleng
+bergemal-gemal
+bergembut-gembut
+bergerek-gerek
+bergesa-gesa
+bergilir-gilir
+bergolek-golek
+bergores-gores
+bergotong-royong
+bergugus-gugus
+bergulung-gulung
+bergulut-gulut
+bergumpal-gumpal
+bergunung-gunung
+berhadap-hadapan
+berhamun-hamun
+berhandai-handai
+berhanyut-hanyut
+berhari-hari
+berhati-hati
+berhilau-hilau
+berhujan-hujan
+beria-ia
+beria-ria
+beriak-riak
+beribu-ribu
+berigi-rigi
+bering-bering
+beringat-ingat
+beringgit-ringgit
+berintik-rintik
+beriring-iring
+beriring-iringan
+berjabir-jabir
+berjaga-jaga
+berjagung-jagung
+berjalan-jalan
+berjalar-jalar
+berjalin-jalin
+berjalur-jalur
+berjam-jam
+berjauh-jauhan
+berjejal-jejal
+berjela-jela
+berjenis-jenis
+berjenjang-jenjang
+berjilid-jilid
+berjinak-jinak
+berjingkat-jingkat
+berjingkrak-jingkrak
+berjongkok-jongkok
+berjubel-jubel
+berjujut-jujutan
+berjulai-julai
+berjumbai-jumbai
+berjurai-jurai
+berjurus-jurus
+berjuta-juta
+berkaca-kaca
+berkait-kaitan
+berkala-kala
+berkali-kali
+berkanjar-kanjar
+berkaok-kaok
+berkarung-karung
+berkasih-kasihan
+berkata-kata
+berkatak-katak
+berkecai-kecai
+berkecek-kecek
+berkecil-kecil
+berkecil-kecilan
+berkedip-kedip
+berkejang-kejang
+berkejap-kejap
+berkejar-kejaran
+berkelar-kelar
+berkelip-kelip
+berkelit-kelit
+berkelok-kelok
+berkelompok-kelompok
+berkelun-kelun
+berkembur-kembur
+berkempul-kempul
+berkena-kenaan
+berkenal-kenalan
+berkendur-kendur
+berkeok-keok
+berkepak-kepak
+berkepal-kepal
+berkeping-keping
+berkepul-kepul
+berkeras-kerasan
+berkeritik-keritik
+berkeruit-keruit
+berkerut-kerut
+berketak-ketak
+berketak-ketik
+berketi-keti
+berketil-ketil
+berketuk-ketak
+berketul-ketul
+berkial-kial
+berkian-kian
+berkias-kiasan
+berkibar-kibar
+berkilah-kilah
+berkilat-kilat
+berkilau-kilauan
+berkilo-kilo
+berkinja-kinja
+berkipas-kipas
+berkira-kira
+berkirim-kiriman
+berkobar-kobar
+berkobok-kobok
+berkocak-kocak
+berkodi-kodi
+berkolek-kolek
+berkopah-kopah
+berkotak-kotak
+berkuat-kuatan
+berkunang-kunang
+berkurun-kurun
+berkusau-kusau
+berkusu-kusu
+berkusut-kusut
+berkuting-kuting
+berkutu-kutuan
+berlabun-labun
+berlain-lainan
+berlalai-lalai
+berlama-lama
+berlambai-lambai
+berlambak-lambak
+berlampang-lampang
+berlapang-lapang
+berlapis-lapis
+berlapuk-lapuk
+berlarah-larah
+berlarat-larat
+berlari-larian
+berlarik-larik
+berlarut-larut
+berlawak-lawak
+berlayap-layapan
+berlebih-lebih
+berlebih-lebihan
+berlekas-lekas
+berlena-lena
+berlengah-lengah
+berlenggek-lenggek
+berlenggok-lenggok
+berleret-leret
+berliang-liuk
+berliku-liku
+berlimpah-limpah
+berlimpap-limpap
+berlimpit-limpit
+berlinang-linang
+berlindak-lindak
+berlipat-lipat
+berlompok-lompok
+berloncat-loncatan
+berlopak-lopak
+berlubang-lubang
+bermaaf-maafan
+bermacam-macam
+bermain-main
+bermalas-malas
+bermanik-manik
+bermanis-manis
+bermanja-manja
+bermasak-masak
+bermati-mati
+bermegah-megah
+bermemek-memek
+bermesra-mesraan
+bermewah-mewah
+berminggu-minggu
+berminta-minta
+bermuda-muda
+bermudah-mudah
+bermuka-muka
+bermula-mula
+bermulut-mulut
+bernafsi-nafsi
+bernaka-naka
+berniat-niat
+berogak-ogak
+beroleng-oleng
+berolok-olok
+beromong-omong
+beronggok-onggok
+berorang-orang
+beroyal-royal
+berpada-pada
+berpahit-pahit
+berpair-pair
+berpal-pal
+berpalu-palu
+berpalu-paluan
+berpalun-palun
+berpandai-pandai
+berpandang-pandangan
+berpangkat-pangkat
+berpanjang-panjang
+berpasang-pasang
+berpasang-pasangan
+berpayah-payah
+berpeluh-peluh
+berpeluk-pelukan
+berpenat-penat
+berpencar-pencar
+berpendar-pendar
+berpenggal-penggal
+berperai-perai
+berpesai-pesai
+berpesta-pesta
+berpesuk-pesuk
+berpetak-petak
+berpeti-peti
+berpihak-pihak
+berpijar-pijar
+berpikul-pikul
+berpilih-pilih
+berpilin-pilin
+berpindah-pindah
+berpintal-pintal
+berpirau-pirau
+berpisah-pisah
+berpolah-polah
+berpongah-pongah
+berpontang-panting
+berporah-porah
+berpotong-potong
+berpuak-puak
+berpual-pual
+berpugak-pugak
+berpuluh-puluh
+berpulun-pulun
+berpuntal-puntal
+berpura-pura
+berpusar-pusar
+berpusing-pusing
+berpusu-pusu
+berputar-putar
+bersaf-saf
+bersahut-sahutan
+bersakit-sakit
+bersalah-salahan
+bersalam-salaman
+bersalin-salin
+bersama-sama
+bersambut-sambutan
+bersampan-sampan
+bersantai-santai
+bersapa-sapaan
+bersarang-sarang
+bersedan-sedan
+bersedia-sedia
+bersedu-sedu
+bersekat-sekat
+berselang-selang
+berselang-seli
+bersembur-semburan
+bersempit-sempit
+bersenang-senang
+bersenang-senangkan
+bersenda-senda
+bersendi-sendi
+bersepah-sepah
+bersepi-sepi
+berserak-serak
+berseri-seri
+bersesak-sesak
+bersetai-setai
+bersia-sia
+bersiap-siap
+bersiar-siar
+bersilir-silir
+bersimbur-simburan
+bersinau-sinau
+bersorak-sorai
+bersuap-suapan
+bersudah-sudah
+bersuka-suka
+bersuka-sukaan
+bersuku-suku
+bersumpah-sumpahan
+bersungguh-sungguh
+bersungut-sungut
+bersunyi-sunyi
+bersusah-susah
+bersusuk-susuk
+bersusuk-susukan
+bersutan-sutan
+bertabur-tabur
+bertahu-tahu
+bertahun-tahun
+bertajuk-tajuk
+bertakik-takik
+bertala-tala
+bertali-tali
+bertalu-talu
+bertambah-tambah
+bertanda-tandaan
+bertangis-tangisan
+bertangkil-tangkil
+bertanya-tanya
+bertarik-tarikan
+bertatai-tatai
+bertatih-tatih
+bertawan-tawan
+bertawar-tawaran
+bertebu-tebu
+bertebu-tebukan
+berteguh-teguh
+berteguh-teguhan
+berteka-teki
+bertelau-telau
+bertele-tele
+bertempat-tempat
+bertempuh-tempuh
+bertenang-tenang
+bertenggang-tenggangan
+bertentu-tentu
+bertepek-tepek
+berterang-terang
+berterang-terangan
+bertikam-tikaman
+bertimbal-timbalan
+bertimbun-timbun
+bertimpa-timpa
+bertimpas-timpas
+bertingkah-tingkah
+bertingkat-tingkat
+bertinjau-tinjauan
+bertiras-tiras
+bertitar-titar
+bertoboh-toboh
+bertolak-tolak
+bertolak-tolakan
+bertolong-tolongan
+bertonjol-tonjol
+bertua-tua
+bertua-tuaan
+bertual-tual
+bertubi-tubi
+bertukar-tukar
+bertukar-tukaran
+bertukas-tukas
+bertumpak-tumpak
+bertunda-tunda
+bertunjuk-tunjukan
+bertura-tura
+berturut-turut
+bertutur-tutur
+beruas-ruas
+berubah-ubah
+berulang-alik
+berulang-ulang
+berumbai-rumbai
+berundung-undung
+berunggas-runggas
+berungkur-ungkuran
+beruntai-untai
+beruntun-runtun
+berunyai-unyai
+berupa-rupa
+berura-ura
+beruris-uris
+berurut-urutan
+berwarna-warna
+berwarna-warni
+berwindu-windu
+berwiru-wiru
+beryang-yang
+besar-besaran
+betak-betak
+beti-beti
+betul-betul
+biang-biang
+biar-biar
+biji-bijian
+bila-bila
+bilang-bilang
+bincang-bincut
+bini-binian
+biri-biri
+biru-biru
+bisik-bisik
+biti-biti
+bolak-balik
+bolang-baling
+bongkar-bangkir
+buah-buahan
+buat-buatan
+buaya-buaya
+bubun-bubun
+bugi-bugi
+built-in
+bukan-bukan
+bulan-bulan
+bulan-bulanan
+bulang-bulang
+bulat-bulat
+buli-buli
+bulu-bulu
+buluh-buluh
+bulus-bulus
+bunga-bungaan
+bunuh-membunuh
+bunyi-bunyian
+buru-buru
+burung-burungan
+bye-bye
+cabik-cabik
+caing-caing
+calar-balar
+cara-cara
+carut-marut
+cawi-cawi
+cebar-cebur
+celam-celum
+celangak-celinguk
+celas-celus
+celedang-celedok
+celengkak-celengkok
+cemas-cemas
+centang-perenang
+cepat-cepat
+cerai-berai
+ceruk-menceruk
+ceruk-meruk
+check-up
+chit-chat
+cirit-birit
+cita-cita
+close-up
+closed-circuit
+cobak-cabik
+cobar-cabir
+cola-cala
+compang-camping
+congak-cangit
+congkah-cangkih
+congkah-mangkih
+copak-capik
+corak-carik
+corat-coret
+coreng-moreng
+cuang-caing
+cubung-cubung
+culik-culik
+cuma-cuma
+cumi-cumi
+cungap-cangip
+cupu-cupu
+dahulu-mendahului
+dali-dali
+dapur-dapur
+dari-dari
+daru-daru
+datang-datang
+datang-mendatangi
+daun-daunan
+dawai-dawai
+dayang-dayang
+degap-degap
+dekak-dekak
+dekat-dekat
+dengar-dengaran
+desas-desus
+diam-diam
+do-it-yourself
+dokok-dokok
+dolak-dalik
+dorong-mendorong
+drive-in
+dua-dua
+dua-duanya
+duduk-duduk
+dulang-dulang
+ecek-ecek
+embuh-embuhan
+empek-empek
+empok-empok
+encal-encal
+endap-endap
+endut-endutan
+engah-engah
+enggan-enggan
+engkah-engkah
+entah-berentah
+erang-erot
+erong-erong
+fast-food
+fifty-fifty
+flip-flop
+follow-up
+foya-foya
+gaba-gaba
+gabai-gabai
+gada-gada
+gading-gading
+gado-gado
+gajah-gajahan
+gala-gala
+gali-galian
+galing-galing
+galu-galu
+gamit-gamitan
+gampang-gampangan
+ganal-ganal
+ganda-berganda
+gapah-gopoh
+gara-gara
+garah-garah
+gatal-gatal
+gawar-gawar
+gaya-gayanya
+gedebak-gedebuk
+gelang-gelang
+gelembung-gelembungan
+geli-geli
+geliang-geliut
+geliat-geliut
+gempul-gempul
+gendang-gendang
+genjang-genjot
+gerabak-gerubuk
+gerak-gerik
+gerbas-gerbus
+gerit-gerit
+geruh-gerah
+getak-getuk
+geti-geti
+gila-gila
+gila-gilaan
+gilang-gemilang
+gilap-gemilap
+gili-gili
+giling-giling
+ginang-ginang
+girik-girik
+giring-giring
+go-kart
+golak-galik
+gonta-ganti
+gotong-royong
+gual-gail
+gudu-gudu
+gula-gula
+gulang-gulang
+guna-guna
+guntang-guntang
+gunung-ganang
+gunung-gemunung
+gunung-gunungan
+habis-habis
+habis-habisan
+halai-balai
+half-time
+hampir-hampir
+harap-harapan
+harum-haruman
+hati-hati
+heavy-duty
+hebat-hebatan
+hidup-hidup
+hiru-biru
+hiruk-pikuk
+hubaya-hubaya
+hula-hula
+huru-hara
+ibar-ibar
+icak-icak
+igau-igauan
+ikut-ikut
+ikut-ikutan
+ilam-ilam
+imbang-imbangan
+inang-inang
+inca-binca
+incang-incut
+ingat-ingat
+ingat-ingatan
+ingau-ingauan
+inggang-inggung
+injak-injak
+iras-iras
+iring-iringan
+iseng-iseng
+jadi-jadian
+jala-jala
+jamah-jamahan
+jambu-jambu
+jangan-jangan
+jarang-jarang
+jari-jari
+jaring-jaring
+jarum-jarum
+jauh-jauh
+jawi-jawi
+jebat-jebatan
+jelur-jelir
+jendal-jendul
+jenggar-jenggur
+jentik-jentik
+jerah-jerih
+jolong-jolong
+jongkar-jangkir
+juak-juak
+juang-juang
+julung-julung
+jurai-jurai
+kabu-kabu
+kacang-kacang
+kacang-kacangan
+kacau-balau
+kadang-kadang
+kail-kail
+kait-kait
+kakek-kakek
+kalau-kalau
+kaleng-kalengan
+kalut-malut
+kambing-kambing
+kanak-kanak
+kapa-kapa
+kapan-kapan
+kapu-kapu
+karang-karangan
+karang-mengarang
+kareseh-peseh
+karut-marut
+katang-katang
+kawa-kawa
+kayu-kayuan
+keabu-abuan
+keasyik-asyikan
+kebarat-baratan
+kebasah-basahan
+kebat-kebit
+kebata-bataan
+kebelanda-belandaan
+kebiru-biruan
+kebudak-budakan
+kecil-kecilan
+kecil-mengecil
+kecuh-kecah
+kedek-kedek
+kegadis-gadisan
+kegelap-gelapan
+kegila-gilaan
+kegirang-girangan
+kehijau-hijauan
+kehitam-hitaman
+kejaga-jagaan
+kejingga-jinggaan
+kekabur-kaburan
+kekanak-kanakan
+kekoboi-koboian
+kekuning-kuningan
+kelak-kelik
+kelak-keluk
+kelaki-lakian
+kelang-kelok
+kelap-kelip
+kelek-kelek
+kelek-kelekan
+kelik-kelik
+kelip-kelip
+kelusuh-kelasah
+kelut-melut
+kemak-kemik
+kemalu-maluan
+kemanja-manjaan
+kemarah-marahan
+kemasam-masaman
+kemati-matian
+kemerah-merahan
+kempang-kempis
+kempas-kempis
+kemuda-mudaan
+kena-mengena
+kenal-mengenal
+kenang-kenangan
+kencang-kencung
+kendang-kendang
+kendang-kendangan
+kentung-kentung
+kenyat-kenyit
+kepandir-pandiran
+kepang-kepot
+keperak-perakan
+kepilu-piluan
+kepura-puraan
+keputih-putihan
+kerah-kerahan
+kerancak-rancakan
+kerang-kerangan
+kerang-keroh
+kerang-kerung
+kerap-kerap
+keras-mengerasi
+kercap-kercip
+kercap-kercup
+keriang-keriut
+kernyat-kernyut
+kerong-kerong
+keropas-kerapis
+kertak-kertuk
+keruntang-pungkang
+kesap-kesip
+kesenak-senakan
+kesewenang-wenangan
+kesia-siaan
+kesik-kesik
+kesipu-sipuan
+kesu-kesi
+kesuh-kesih
+kesuk-kesik
+ketergesa-gesaan
+keti-keti
+ketidur-tiduran
+ketiga-tiganya
+ketua-tuaan
+ketuan-tuanan
+keungu-unguan
+kia-kia
+kiak-kiak
+kial-kial
+kiang-kiut
+kibang-kibut
+kicang-kecoh
+kicang-kicu
+kida-kida
+kilau-mengilau
+kili-kili
+kira-kira
+kira-kiraan
+kisi-kisi
+kocah-kacih
+kodok-kodok
+kolang-kaling
+koleh-koleh
+kolong-kolong
+koma-koma
+komat-kamit
+kontal-kantil
+kontang-kanting
+kosak-kasik
+kotak-katik
+kotak-kotak
+kuat-kuat
+kucar-kacir
+kucing-kucing
+kucing-kucingan
+kuda-kuda
+kuda-kudaan
+kudap-kudap
+kulah-kulah
+kulak-kulak
+kulik-kulik
+kulum-kulum
+kumat-kamit
+kunang-kunang
+kupat-kapit
+kupu-kupu
+kura-kura
+kurang-kurang
+kusat-mesat
+kutat-kutet
+kuti-kuti
+labi-labi
+labu-labu
+lagi-lagi
+laguh-lagah
+laki-laki
+lalu-lalang
+lama-kelamaan
+lama-lama
+lamat-lamat
+lambat-lambat
+lancar-lancar
+langak-longok
+langit-langit
+lanja-lanjaan
+lapat-lapat
+large-scale
+lari-lari
+lauk-pauk
+lawah-lawah
+lawak-lawak
+lawi-lawi
+layang-layang
+layu-layuan
+lebih-lebih
+legak-legok
+lekak-lekuk
+lekap-lekup
+lekas-lekas
+lekuh-lekih
+lekup-lekap
+lenggak-lenggok
+lenggok-lenggok
+lengket-lengket
+lentam-lentum
+lentang-lentok
+lentang-lentung
+lepa-lepa
+lerang-lerang
+lereng-lereng
+letah-letai
+letup-letup
+liang-liuk
+lidah-lidah
+line-up
+liuk-liuk
+liung-liung
+lobi-lobi
+lock-up
+lopak-lapik
+lopak-lopak
+lumba-lumba
+lumi-lumi
+luntang-lantung
+lupa-lupa
+lupa-lupaan
+main-mainan
+makan-makanan
+make-up
+malai-malai
+malam-malam
+malar-malar
+mali-mali
+malu-malu
+mana-mana
+manik-manik
+manis-manisan
+mark-up
+masing-masing
+mata-mata
+mati-matian
+maya-maya
+megap-megap
+megrek-megrek
+melak-melak
+melambai-lambai
+melambai-lambaikan
+melambat-lambatkan
+melaun-laun
+melawak-lawak
+melayap-layap
+melayap-layapkan
+melebih-lebihi
+melebih-lebihkan
+melejang-lejangkan
+melengah-lengah
+melihat-lihat
+melimpah-limpah
+melincah-lincah
+meloncat-loncat
+melonco-lonco
+melonjak-lonjak
+memacak-macak
+memaki-maki
+memaksa-maksa
+memandai-mandai
+memanggil-manggil
+memanis-manis
+memanjut-manjut
+memasak-masak
+memata-matai
+mematah-matah
+mematut-matut
+memayah-mayahkan
+membagi-bagikan
+membalik-balik
+membangkit-bangkit
+membayang-bayangi
+membayang-bayangkan
+membelai-belai
+membenar-benar
+membenar-benari
+memberai-beraikan
+membesar-besarkan
+membolak-balikkan
+membuang-buang
+membuat-buat
+membunga-bungai
+memburu-buru
+memburu-burukan
+memburuk-burukkan
+memencak-mencak
+memencar-mencar
+memetak-metak
+memetang-metangkan
+memetir-metir
+memikir-mikirkan
+memilih-milih
+meminang-minang
+meminta-minta
+memisah-misahkan
+memontang-mantingkan
+memperamat-amat
+memperamat-amatkan
+memperbagai-bagaikan
+memperganda-gandakan
+memperganduh-ganduhkan
+mempermacam-macamkan
+memperolok-olokkan
+mempersama-samakan
+mempertubi-tubi
+mempertubi-tubikan
+memperturut-turutkan
+memuja-muja
+memukang-mukang
+memulun-mulun
+memundi-mundi
+memundi-mundikan
+memuyu-muyu
+menagak-nagak
+menakut-nakuti
+menanjur-nanjur
+menanti-nanti
+menari-nari
+mencabik-cabik
+mencabik-cabikkan
+mencaing-caing
+mencak-mencak
+mencakup-cakup
+mencapak-capak
+mencari-cari
+mencarik-carik
+mencarut-carut
+mencengis-cengis
+mencepak-cepak
+mencepuk-cepuk
+mencerai-beraikan
+mencetai-cetai
+menciap-ciap
+menciar-ciar
+mencita-citakan
+menciut-ciut
+mencoang-coang
+mencubit-cubit
+mencuri-curi
+mendecap-decap
+mendengking-dengking
+menderak-derakkan
+menderau-derau
+menderu-deru
+mendesas-desuskan
+mendesus-desus
+mendewa-dewakan
+mendudu-dudu
+menebu-nebu
+menegur-neguri
+mengabung-ngabung
+mengaci-acikan
+mengada-ada
+mengaduk-aduk
+mengagak-agak
+mengagak-agihkan
+mengagut-agut
+mengais-ngais
+mengali-ali
+mengalur-alur
+mengamang-amang
+mengamat-amati
+mengambai-ambaikan
+mengambang-ambang
+mengancak-ancak
+mengangan-angankan
+mengangguk-angguk
+mengangin-anginkan
+mengangkat-angkat
+mengap-mengap
+mengapa-apai
+mengapi-apikan
+mengarah-arahi
+mengata-ngatai
+mengaum-aumkan
+mengejan-ejan
+mengelai-ngelai
+mengelepik-ngelepik
+mengelus-elus
+mengembut-embut
+mengenap-enapkan
+mengenjak-enjak
+mengepak-ngepak
+mengepak-ngepakkan
+menggaba-gabai
+menggalur-galur
+menggamak-gamak
+menggapai-gapai
+menggapai-gapaikan
+menggelepar-gelepar
+menggelepar-geleparkan
+menggemak-gemak
+menggerecak-gerecak
+menggesa-gesakan
+menggili-gili
+menggorek-gorek
+menggosok-gosok
+mengguit-guit
+menghalai-balaikan
+menghinap-hinap
+mengiang-ngiang
+mengibas-ngibas
+mengidam-idamkan
+mengilah-ngilahkan
+mengilai-ilai
+mengilat-ngilatkan
+mengilik-ngilik
+mengimak-imak
+mengiming-iming
+menginjak-injak
+mengipas-ngipas
+mengira-ngira
+mengira-ngirakan
+mengiras-iras
+mengiras-irasi
+mengitar-ngitar
+mengitik-ngitik
+mengogok-ogok
+mengolak-alikkan
+mengoleng-oleng
+mengongkang-ongkang
+mengongkok-ongkok
+mengonyah-anyih
+mengotak-ngatikkan
+mengoyak-ngoyakkan
+mengoyak-oyak
+menguar-nguarkan
+menguar-uarkan
+menguber-uber
+mengubit-ubit
+mengubrak-abrik
+mengucar-ngacirkan
+mengucek-ngucek
+menguik-uik
+menguis-uis
+mengulit-ulit
+menguman-uman
+mengumbang-ambingkan
+mengumpak-umpak
+mengungkat-ungkat
+mengungkit-ungkit
+mengurik-urik
+mengutak-ngatikkan
+mengutik-ngutik
+menimang-nimang
+meningkat-ningkat
+meniru-niru
+meniup-niup
+menjadi-jadi
+menjengek-jengek
+menjengit-jengit
+menjilat-jilat
+mentah-mentah
+mentang-mentang
+menunda-nunda
+menusuk-nusuk
+menyama-nyama
+menyambar-nyambar
+menyanjung-nyanjung
+menyapu-nyapu
+menyarat-nyarat
+menyendi-nyendi
+menyeret-nyeret
+menyeru-nyerukan
+menyia-nyiakan
+menyungguh-nyungguhi
+meraba-raba
+merangkak-rangkak
+merasa-rasai
+meraung-raung
+meraung-raungkan
+merayau-rayau
+merayu-rayu
+mereka-reka
+merelap-relap
+meremah-remah
+meremeh-temehkan
+merempah-rempahi
+merengek-rengek
+merenik-renik
+merenta-renta
+merenyai-renyai
+merintang-rintang
+merintik-rintik
+merobek-robek
+meronta-ronta
+merungus-rungus
+merungut-rungut
+mewarna-warnikan
+meyakin-yakini
+miju-miju
+minta-minta
+moga-moga
+morat-marit
+muda-mudi
+mudah-mudahan
+muka-muka
+mula-mula
+muluk-muluk
+naga-naga
+nanti-nantian
+nasi-nasi
+nasib-nasiban
+nenek-nenek
+nyolong-nyolong
+ogah-ogahan
+ogak-ogak
+olak-alik
+olak-olak
+olang-aling
+olang-alingan
+oleh-oleh
+olok-olok
+olok-olokan
+olong-olong
+on-screen
+onde-onde
+one-to-one
+oneng-oneng
+ongkang-ongkang
+ongol-ongol
+onyah-anyih
+orak-arik
+orang-aring
+orang-orangan
+orok-orok
+orong-orong
+otak-otak
+otak-otakan
+padi-padian
+pagi-pagi
+palas-palas
+paling-paling
+palu-memalu
+panas-panas
+pandang-memandang
+panji-panji
+para-para
+paru-paru
+pasang-memasang
+pasu-pasu
+paya-paya
+pecah-pecah
+pelan-pelan
+pengundang-undang
+perang-perangan
+perintang-rintang
+perlahan-lahan
+perlip-perlipan
+pertama-tama
+perundang-undangan
+pesan-pesan
+piat-piut
+pick-up
+pijak-pijak
+pijar-pijar
+pijat-pijat
+pina-pina
+pisang-pisang
+play-off
+pohon-pohonan
+pokrol-pokrolan
+polang-paling
+poma-poma
+pontang-panting
+porak-parik
+porak-peranda
+potong-memotong
+puji-pujian
+pukang-pukang
+pukul-memukul
+pulang-pergi
+pulut-pulut
+pundi-pundi
+punggung-memunggung
+pura-pura
+pusar-pusar
+push-up
+pusing-pusing
+putus-putus
+rada-rada
+radio-frequency
+ragu-ragu
+rama-rama
+rambu-rambu
+rango-rango
+rasa-rasanya
+rata-rata
+real-time
+rebah-rebah
+rebah-rebahan
+redam-redam
+reka-reka
+reka-rekaan
+remah-remah
+remang-remang
+rembah-rembih
+remeh-temeh
+rempah-rempah
+repuh-repuh
+riang-riang
+ribu-ribu
+rigi-rigi
+robak-rabik
+robat-rabit
+role-play
+roll-on
+rombang-rambing
+ruak-ruak
+ruku-ruku
+rumah-rumah
+rumah-rumahan
+rumput-rumputan
+runding-merunding
+runggu-rangga
+runner-up
+rupa-rupa
+rupa-rupanya
+saban-saban
+sabung-menyabung
+saing-menyaing
+salah-salah
+sama-sama
+samar-samar
+sambar-menyambar
+sambung-bersambung
+sambung-menyambung
+sambut-menyambut
+sampai-sampai
+sandar-menyandar
+sangat-sangat
+sangkut-menyangkut
+sapa-menyapa
+sapu-sapu
+sarit-sarit
+satu-satu
+satu-satunya
+sayup-menyayup
+sayup-sayup
+sayur-mayur
+sayur-sayuran
+sci-fi
+seakal-akal
+seakan-akan
+sealak-alak
+sebaik-baiknya
+sebelah-menyebelah
+sebentar-sebentar
+seberang-menyeberang
+seboleh-bolehnya
+sedalam-dalamnya
+sedang-menyedang
+sedap-sedapan
+sedapat-dapatnya
+sedikit-dikitnya
+sedikit-sedikit
+sedikit-sedikitnya
+seelok-eloknya
+segala-galanya
+segan-menyegan
+segan-menyegani
+segan-segan
+sehari-hari
+sehari-harian
+sejadi-jadinya
+sekali-kali
+sekali-sekali
+sekira-kira
+sekonyong-konyong
+sekuasa-kuasanya
+sekurang-kurangnya
+sela-menyela
+sela-sela
+selama-lamanya
+selambat-lambatnya
+selang-seli
+selang-seling
+selar-belar
+selat-latnya
+selekas-lekasnya
+selepas-lepas
+self-esteem
+self-help
+sema-sema
+semah-semah
+semak-semak
+semalam-malaman
+semasa-masa
+semata-mata
+sembunyi-sembunyi
+sembunyi-sembunyian
+semena-mena
+semenda-menyemenda
+semengga-mengga
+sementang-mentang
+semu-semu
+semut-semutan
+sengal-sengal
+sengau-sengauan
+seolah-olah
+sepala-pala
+sepandai-pandai
+sepetang-petangan
+sepoi-sepoi
+sepuas-puasnya
+serang-menyerang
+seraya-menyeraya
+serba-serbi
+serbah-serbih
+serembah-serembih
+sering-sering
+serta-menyertai
+serta-serta
+sesal-menyesali
+sesudah-sudah
+sesudah-sudahnya
+sesuka-suka
+setempat-setempat
+setengah-setengah
+setidak-tidaknya
+seupaya-upaya
+seupaya-upayanya
+sewaktu-waktu
+sewenang-wenang
+short-term
+sia-sia
+siang-siang
+siapa-siapa
+sibar-sibar
+sibur-sibur
+sida-sida
+siku-siku
+silah-silah
+silang-menyilang
+silir-semilir
+sinar-seminar
+sindir-menyindir
+singgah-menyinggah
+sorak-sorai
+stand-by
+stand-up
+sudu-sudu
+sudung-sudung
+suka-suka
+sulang-menyulang
+sulur-suluran
+sumpah-sumpah
+sumpit-sumpit
+sungguh-sungguh
+sungut-sungut
+suram-suram
+surat-menyurat
+suruh-suruhan
+tabar-tabar
+tabir-mabir
+tabrak-tubruk
+tabuh-tabuhan
+tahu-menahu
+tahu-tahu
+takang-takik
+take-off
+takut-takut
+takut-takutan
+tali-bertali
+tali-tali
+tampak-tampak
+tanam-menanam
+tanam-tanaman
+tanda-tanda
+tangan-menangan
+tangan-tangan
+tanggung-menanggung
+tapa-tapa
+tapak-tapak
+tari-menari
+tari-tarian
+tarik-menarik
+tatah-tatah
+tawak-tawak
+tawang-tawang
+tawar-menawar
+tawar-tawar
+tayum-temayum
+tebu-tebu
+tegak-tegak
+teka-teki
+temas-temas
+tembak-menembak
+temut-temut
+tenggang-menenggang
+teraba-raba
+terambang-ambang
+terang-terang
+terang-terangan
+teranggar-anggar
+terangguk-angguk
+teranggul-anggul
+terangin-angin
+terangkup-angkup
+teranja-anja
+terapung-apung
+terayan-rayan
+terayap-rayap
+terbada-bada
+terbahak-bahak
+terbata-bata
+terbatuk-batuk
+terbayang-bayang
+terbengkil-bengkil
+terbirit-birit
+terbuai-buai
+terbuang-buang
+terburu-buru
+tercangak-cangak
+tercengang-cengang
+tercilap-cilap
+tercongget-congget
+tercungap-cungap
+terdangka-dangka
+terdengih-dengih
+terekeh-ekeh
+terembut-embut
+terembut-rembut
+terengah-engah
+teresak-esak
+tergagap-gagap
+tergagau-gagau
+tergaguk-gaguk
+tergapai-gapai
+tergegap-gegap
+tergegas-gegas
+tergelung-gelung
+tergerenyeng-gerenyeng
+tergesa-gesa
+tergila-gila
+tergontai-gontai
+tergudik-gudik
+terguling-guling
+tergulut-gulut
+terharak-harak
+terharap-harap
+terhengit-hengit
+terhinggut-hinggut
+terigau-igau
+terincut-incut
+teringa-inga
+teringat-ingat
+terinjak-injak
+terjembak-jembak
+terjerit-jerit
+terkadang-kadang
+terkakah-kakah
+terkakak-kakak
+terkanjar-kanjar
+terkapah-kapah
+terkapai-kapai
+terkapung-kapung
+terkatah-katah
+terkatung-katung
+terkecap-kecap
+terkedek-kedek
+terkedip-kedip
+terkejar-kejar
+terkekau-kekau
+terkekeh-kekeh
+terkekek-kekek
+terkelinjat-kelinjat
+terkelip-kelip
+terkempul-kempul
+terkemut-kemut
+terkencar-kencar
+terkepak-kepak
+terkesot-kesot
+terkesut-kesut
+terkial-kial
+terkincak-kincak
+terkindap-kindap
+terkinja-kinja
+terkirai-kirai
+terkitar-kitar
+terkocoh-kocoh
+terkokol-kokol
+terkosel-kosel
+terkoteng-koteng
+terkumpal-kumpal
+terlara-lara
+terlayang-layang
+terlebih-lebih
+terlincah-lincah
+terliuk-liuk
+terlolong-lolong
+terlongong-longong
+termangu-mangu
+termanja-manja
+termata-mata
+termengah-mengah
+termimpi-mimpi
+ternanti-nanti
+terngiang-ngiang
+teroleng-oleng
+terpandang-pandang
+terpecah-pecah
+terpekik-pekik
+terpereh-pereh
+terpikau-pikau
+terpinga-pinga
+terpingkal-pingkal
+terpontang-panting
+terpusing-pusing
+terputus-putus
+tersanga-sanga
+tersaruk-saruk
+tersedan-sedan
+tersedih-sedih
+tersedu-sedu
+tersendat-sendat
+tersendeng-sendeng
+tersengal-sengal
+tersengguk-sengguk
+tersengut-sengut
+tersera-sera
+terserak-serak
+tersetai-setai
+tersia-sia
+tersipu-sipu
+tersoja-soja
+tersungkuk-sungkuk
+tertagak-tagak
+tertahan-tahan
+tertatih-tatih
+tertegun-tegun
+tertekan-tekan
+terteleng-teleng
+terumbang-ambing
+terumbang-umbang
+terungkap-ungkap
+terus-menerus
+terus-terusan
+think-tank
+tiap-tiap
+tiba-tiba
+tidak-tidak
+tidur-tidur
+tie-dye
+tiga-tiganya
+tikam-menikam
+tilik-menilik
+timah-timah
+timang-timangan
+timbang-menimbang
+timu-timu
+tindih-bertindih
+tinjau-meninjau
+tip-off
+tiru-tiruan
+tiup-tiup
+tokak-takik
+tokok-menokok
+tolak-menolak
+tolong-menolong
+top-level
+trade-in
+tua-tua
+tuan-tuan
+tuang-tuang
+tuban-tuban
+tukang-menukang
+tukar-menukar
+tulang-tulangan
+tuli-tuli
+tulis-menulis
+tumbuh-tumbuhan
+tune-up
+tunggang-tunggit
+tupai-tupai
+turun-temurun
+turut-menurut
+turut-turutan
+two-tone
+uar-uar
+ubel-ubel
+ubun-ubun
+ubur-ubur
+uci-uci
+udap-udapan
+ugal-ugalan
+uir-uir
+ujar-ujar
+ukir-mengukir
+ula-ula
+ulak-ulak
+ulang-alik
+ulang-aling
+ulang-ulang
+ulap-ulap
+ular-ular
+ular-ularan
+ulung-ulung
+umang-umang
+umbang-ambing
+umbi-umbian
+umbul-umbul
+umbut-umbut
+uncang-uncit
+undak-undakan
+undang-undang
+unduk-unduk
+undung-undung
+undur-undur
+unggat-unggit
+ungkit-ungkit
+unting-unting
+untung-untung
+untung-untungan
+upside-down
+ura-ura
+uran-uran
+urat-urat
+uring-uringan
+urup-urup
+urup-urupan
+urus-urus
+user-user
+user-useran
+utar-utar
+voice-over
+walk-out
+wangi-wangian
+wanti-wanti
+wara-wara
+warna-warni
+water-cooled
+world-class
+yang-yang
+""".split()
+)
diff --git a/spacy/lang/ms/examples.py b/spacy/lang/ms/examples.py
new file mode 100644
index 000000000..97ab19b6e
--- /dev/null
+++ b/spacy/lang/ms/examples.py
@@ -0,0 +1,18 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.ms.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+ "Malaysia ialah sebuah negara yang terletak di Asia Tenggara.",
+ "Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?",
+ "Pengeluaran makanan berasal dari beberapa lokasi termasuk Cameron Highlands, Johor Bahru, dan Kuching.",
+ "Syarikat XYZ telah menghasilkan 20,000 unit produk baharu dalam setahun terakhir",
+ "Kuala Lumpur merupakan ibu negara Malaysia." "Kau berada di mana semalam?",
+ "Siapa yang akan memimpin projek itu?",
+ "Siapa perdana menteri Malaysia sekarang?",
+]
diff --git a/spacy/lang/ms/lex_attrs.py b/spacy/lang/ms/lex_attrs.py
new file mode 100644
index 000000000..42759fa4f
--- /dev/null
+++ b/spacy/lang/ms/lex_attrs.py
@@ -0,0 +1,66 @@
+import unicodedata
+
+from .punctuation import LIST_CURRENCY
+from ...attrs import IS_CURRENCY, LIKE_NUM
+
+
+_num_words = [
+ "kosong",
+ "satu",
+ "dua",
+ "tiga",
+ "empat",
+ "lima",
+ "enam",
+ "tujuh",
+ "lapan",
+ "sembilan",
+ "sepuluh",
+ "sebelas",
+ "belas",
+ "puluh",
+ "ratus",
+ "ribu",
+ "juta",
+ "billion",
+ "trillion",
+ "kuadrilion",
+ "kuintilion",
+ "sekstilion",
+ "septilion",
+ "oktilion",
+ "nonilion",
+ "desilion",
+]
+
+
+def like_num(text):
+ if text.startswith(("+", "-", "±", "~")):
+ text = text[1:]
+ text = text.replace(",", "").replace(".", "")
+ if text.isdigit():
+ return True
+ if text.count("/") == 1:
+ num, denom = text.split("/")
+ if num.isdigit() and denom.isdigit():
+ return True
+ if text.lower() in _num_words:
+ return True
+ if text.count("-") == 1:
+ _, num = text.split("-")
+ if num.isdigit() or num in _num_words:
+ return True
+ return False
+
+
+def is_currency(text):
+ if text in LIST_CURRENCY:
+ return True
+
+ for char in text:
+ if unicodedata.category(char) != "Sc":
+ return False
+ return True
+
+
+LEX_ATTRS = {IS_CURRENCY: is_currency, LIKE_NUM: like_num}
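
Note: a rough sketch of what these getters accept (inputs illustrative, not exhaustive):

    print(like_num("sebelas"))  # True: listed Malay number word
    print(like_num("2,000"))    # True: separators stripped, digits remain
    print(like_num("ke-10"))    # True: digit part after a single hyphen
    print(like_num("rumah"))    # False

    print(is_currency("RM"))    # True: listed in LIST_CURRENCY
    print(is_currency("$"))     # True: Unicode category "Sc"
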
diff --git a/spacy/lang/ms/punctuation.py b/spacy/lang/ms/punctuation.py
new file mode 100644
index 000000000..9fff72576
--- /dev/null
+++ b/spacy/lang/ms/punctuation.py
@@ -0,0 +1,61 @@
+from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ..char_classes import ALPHA, merge_chars, split_chars, _currency, _units
+
+
+_units = (
+ _units + "s bit Gbps Mbps mbps Kbps kbps ƒ ppi px "
+ "Hz kHz MHz GHz mAh "
+ "ratus rb ribu ribuan "
+ "juta jt jutaan mill?iar million bil[l]?iun bilyun billion "
+)
+_currency = _currency + r" USD RM MYR Rp IDR RMB SGD S\$"
+_months = (
+ "Januari Februari Mac April Mei Jun Julai Ogos September "
+ "Oktober November Disember Januari Februari Mac Mei Jun "
+ "Julai Ogos Oktober Disember Jan Feb Mac Jun Julai Ogos Sept "
+ "Okt Nov Dis"
+)
+
+
+UNITS = merge_chars(_units)
+CURRENCY = merge_chars(_currency)
+HTML_PREFIX = r"<(b|strong|i|em|p|span|div|br)\s?/>|]+)>"
+HTML_SUFFIX = r"(b|strong|i|em|p|span|div|a)>"
+MONTHS = merge_chars(_months)
+LIST_CURRENCY = split_chars(_currency)
+
+_prefixes = list(TOKENIZER_PREFIXES)
+_prefixes.remove("#") # hashtag
+_prefixes = _prefixes + LIST_CURRENCY + [HTML_PREFIX] + ["/", "—"]
+
+_suffixes = (
+ TOKENIZER_SUFFIXES
+ + [r"\-[Nn]ya", "-[KkMm]u", "[—-]"]
+ + [
+ # disabled: variable width currency variable
+ # r"(?<={c})(?:[0-9]+)".format(c=CURRENCY),
+ r"(?<=[0-9])(?:{u})".format(u=UNITS),
+ r"(?<=[0-9])%",
+ # disabled: variable width HTML_SUFFIX variable
+ # r"(?<=[0-9{a}]{h})(?:[\.,:-])".format(a=ALPHA, h=HTML_SUFFIX),
+ r"(?<=[0-9{a}])(?:{h})".format(a=ALPHA, h=HTML_SUFFIX),
+ ]
+)
+
+_infixes = TOKENIZER_INFIXES + [
+ r"(?<=[0-9])[\\/](?=[0-9%-])",
+ r"(?<=[0-9])%(?=[{a}0-9/])".format(a=ALPHA),
+ # disabled: variable width units variable
+ # r"(?<={u})[\/-](?=[0-9])".format(u=UNITS),
+ # disabled: variable width months variable
+ # r"(?<={m})[\/-](?=[0-9])".format(m=MONTHS),
+ r'(?<=[0-9)][.,])"(?=[0-9])',
+ r'(?<=[{a})][.,\'])["—](?=[{a}])'.format(a=ALPHA),
+ r"(?<=[{a}])-(?=[0-9])".format(a=ALPHA),
+ r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}])[\/-](?={c}|[{a}])".format(a=ALPHA, c=CURRENCY),
+]
+
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
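
Note: these rules extend the shared defaults so that hyphenated clitics like "-nya", percent signs and digit/letter hyphens split off. A rough check (exact tokens depend on the full default rule set):

    import spacy

    nlp = spacy.blank("ms")
    print([t.text for t in nlp("Harga-nya naik 10% pada 2023.")])
    # e.g. ['Harga', '-nya', 'naik', '10', '%', 'pada', '2023', '.']
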
diff --git a/spacy/lang/ms/stop_words.py b/spacy/lang/ms/stop_words.py
new file mode 100644
index 000000000..b1bfaea79
--- /dev/null
+++ b/spacy/lang/ms/stop_words.py
@@ -0,0 +1,118 @@
+STOP_WORDS = set(
+ """
+ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
+aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila
+apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal
+awalnya
+
+bagai bagaikan bagaimana bagaimanakah bagaimanapun bagi bagian bahkan bahwa
+bahwasanya baik bakal bakalan balik banyak bapak baru bawah beberapa begini
+beginian beginikah beginilah begitu begitukah begitulah begitupun bekerja
+belakang belakangan belum belumlah benar benarkah benarlah berada berakhir
+berakhirlah berakhirnya berapa berapakah berapalah berapapun berarti berawal
+berbagai berdatangan beri berikan berikut berikutnya berjumlah berkali-kali
+berkata berkehendak berkeinginan berkenaan berlainan berlalu berlangsung
+berlebihan bermacam bermacam-macam bermaksud bermula bersama bersama-sama
+bersiap bersiap-siap bertanya bertanya-tanya berturut berturut-turut bertutur
+berujar berupa besar betul betulkah biasa biasanya bila bilakah bisa bisakah
+boleh bolehkah bolehlah buat bukan bukankah bukanlah bukannya bulan bung
+
+cara caranya cukup cukupkah cukuplah cuma
+
+dahulu dalam dan dapat dari daripada datang dekat demi demikian demikianlah
+dengan depan di dia diakhiri diakhirinya dialah diantara diantaranya diberi
+diberikan diberikannya dibuat dibuatnya didapat didatangkan digunakan
+diibaratkan diibaratkannya diingat diingatkan diinginkan dijawab dijelaskan
+dijelaskannya dikarenakan dikatakan dikatakannya dikerjakan diketahui
+diketahuinya dikira dilakukan dilalui dilihat dimaksud dimaksudkan
+dimaksudkannya dimaksudnya diminta dimintai dimisalkan dimulai dimulailah
+dimulainya dimungkinkan dini dipastikan diperbuat diperbuatnya dipergunakan
+diperkirakan diperlihatkan diperlukan diperlukannya dipersoalkan dipertanyakan
+dipunyai diri dirinya disampaikan disebut disebutkan disebutkannya disini
+disinilah ditambahkan ditandaskan ditanya ditanyai ditanyakan ditegaskan
+ditujukan ditunjuk ditunjuki ditunjukkan ditunjukkannya ditunjuknya dituturkan
+dituturkannya diucapkan diucapkannya diungkapkan dong dua dulu
+
+empat enggak enggaknya entah entahlah
+
+guna gunakan
+
+hal hampir hanya hanyalah hari harus haruslah harusnya hendak hendaklah
+hendaknya hingga
+
+ia ialah ibarat ibaratkan ibaratnya ibu ikut ingat ingat-ingat ingin inginkah
+inginkan ini inikah inilah itu itukah itulah
+
+jadi jadilah jadinya jangan jangankan janganlah jauh jawab jawaban jawabnya
+jelas jelaskan jelaslah jelasnya jika jikalau juga jumlah jumlahnya justru
+
+kala kalau kalaulah kalaupun kalian kami kamilah kamu kamulah kan kapan
+kapankah kapanpun karena karenanya kasus kata katakan katakanlah katanya ke
+keadaan kebetulan kecil kedua keduanya keinginan kelamaan kelihatan
+kelihatannya kelima keluar kembali kemudian kemungkinan kemungkinannya kenapa
+kepada kepadanya kesampaian keseluruhan keseluruhannya keterlaluan ketika
+khususnya kini kinilah kira kira-kira kiranya kita kitalah kok kurang
+
+lagi lagian lah lain lainnya lalu lama lamanya lanjut lanjutnya lebih lewat
+lima luar
+
+macam maka makanya makin malah malahan mampu mampukah mana manakala manalagi
+masa masalah masalahnya masih masihkah masing masing-masing mau maupun
+melainkan melakukan melalui melihat melihatnya memang memastikan memberi
+memberikan membuat memerlukan memihak meminta memintakan memisalkan memperbuat
+mempergunakan memperkirakan memperlihatkan mempersiapkan mempersoalkan
+mempertanyakan mempunyai memulai memungkinkan menaiki menambahkan menandaskan
+menanti menanti-nanti menantikan menanya menanyai menanyakan mendapat
+mendapatkan mendatang mendatangi mendatangkan menegaskan mengakhiri mengapa
+mengatakan mengatakannya mengenai mengerjakan mengetahui menggunakan
+menghendaki mengibaratkan mengibaratkannya mengingat mengingatkan menginginkan
+mengira mengucapkan mengucapkannya mengungkapkan menjadi menjawab menjelaskan
+menuju menunjuk menunjuki menunjukkan menunjuknya menurut menuturkan
+menyampaikan menyangkut menyatakan menyebutkan menyeluruh menyiapkan merasa
+mereka merekalah merupakan meski meskipun meyakini meyakinkan minta mirip
+misal misalkan misalnya mula mulai mulailah mulanya mungkin mungkinkah
+
+nah naik namun nanti nantinya nyaris nyatanya
+
+oleh olehnya
+
+pada padahal padanya pak paling panjang pantas para pasti pastilah penting
+pentingnya per percuma perlu perlukah perlunya pernah persoalan pertama
+pertama-tama pertanyaan pertanyakan pihak pihaknya pukul pula pun punya
+
+rasa rasanya rata rupanya
+
+saat saatnya saja sajalah saling sama sama-sama sambil sampai sampai-sampai
+sampaikan sana sangat sangatlah satu saya sayalah se sebab sebabnya sebagai
+sebagaimana sebagainya sebagian sebaik sebaik-baiknya sebaiknya sebaliknya
+sebanyak sebegini sebegitu sebelum sebelumnya sebenarnya seberapa sebesar
+sebetulnya sebisanya sebuah sebut sebutlah sebutnya secara secukupnya sedang
+sedangkan sedemikian sedikit sedikitnya seenaknya segala segalanya segera
+seharusnya sehingga seingat sejak sejauh sejenak sejumlah sekadar sekadarnya
+sekali sekali-kali sekalian sekaligus sekalipun sekarang sekecil
+seketika sekiranya sekitar sekitarnya sekurang-kurangnya sekurangnya sela
+selain selaku selalu selama selama-lamanya selamanya selanjutnya seluruh
+seluruhnya semacam semakin semampu semampunya semasa semasih semata semata-mata
+semaunya sementara semisal semisalnya sempat semua semuanya semula sendiri
+sendirian sendirinya seolah seolah-olah seorang sepanjang sepantasnya
+sepantasnyalah seperlunya seperti sepertinya sepihak sering seringnya serta
+serupa sesaat sesama sesampai sesegera sesekali seseorang sesuatu sesuatunya
+sesudah sesudahnya setelah setempat setengah seterusnya setiap setiba setibanya
+setidak-tidaknya setidaknya setinggi seusai sewaktu siap siapa siapakah
+siapapun sini sinilah soal soalnya suatu sudah sudahkah sudahlah supaya
+
+tadi tadinya tahu tahun tak tambah tambahnya tampak tampaknya tandas tandasnya
+tanpa tanya tanyakan tanyanya tapi tegas tegasnya telah tempat tengah tentang
+tentu tentulah tentunya tepat terakhir terasa terbanyak terdahulu terdapat
+terdiri terhadap terhadapnya teringat teringat-ingat terjadi terjadilah
+terjadinya terkira terlalu terlebih terlihat termasuk ternyata tersampaikan
+tersebut tersebutlah tertentu tertuju terus terutama tetap tetapi tiap tiba
+tiba-tiba tidak tidakkah tidaklah tiga tinggi toh tunjuk turut tutur tuturnya
+
+ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai
+
+waduh wah wahai waktu waktunya walau walaupun wong
+
+yaitu yakin yakni yang
+""".split()
+)
diff --git a/spacy/lang/ms/syntax_iterators.py b/spacy/lang/ms/syntax_iterators.py
new file mode 100644
index 000000000..fa984d411
--- /dev/null
+++ b/spacy/lang/ms/syntax_iterators.py
@@ -0,0 +1,41 @@
+from typing import Union, Iterator, Tuple
+
+from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
+from ...tokens import Doc, Span
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+ """
+ Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+ """
+ # fmt: off
+ labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+ # fmt: on
+ doc = doclike.doc # Ensure works on both Doc and Span.
+ if not doc.has_annotation("DEP"):
+ raise ValueError(Errors.E029)
+ np_deps = [doc.vocab.strings[label] for label in labels]
+ conj = doc.vocab.strings.add("conj")
+ np_label = doc.vocab.strings.add("NP")
+ prev_end = -1
+ for i, word in enumerate(doclike):
+ if word.pos not in (NOUN, PROPN, PRON):
+ continue
+ # Prevent nested chunks from being produced
+ if word.left_edge.i <= prev_end:
+ continue
+ if word.dep in np_deps:
+ prev_end = word.right_edge.i
+ yield word.left_edge.i, word.right_edge.i + 1, np_label
+ elif word.dep == conj:
+ head = word.head
+ while head.dep == conj and head.head.i < head.i:
+ head = head.head
+ # If the head is an NP, and we're coordinated to it, we're an NP
+ if head.dep in np_deps:
+ prev_end = word.right_edge.i
+ yield word.left_edge.i, word.right_edge.i + 1, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
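
The `noun_chunks` iterator above only needs POS tags and dependency labels, so its behaviour can be sketched without a trained Malay pipeline. A minimal example, assuming this diff is installed; the sentence and its hand-written analysis are illustrative:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("ms")
# "Saya membaca buku itu" ~ "I read that book", annotated by hand.
doc = Doc(
    nlp.vocab,
    words=["Saya", "membaca", "buku", "itu"],
    pos=["PRON", "VERB", "NOUN", "DET"],
    heads=[1, 1, 1, 2],
    deps=["nsubj", "ROOT", "obj", "det"],
)
# Doc.noun_chunks dispatches to the iterator registered via SYNTAX_ITERATORS.
print([chunk.text for chunk in doc.noun_chunks])  # ['Saya', 'buku itu']
```
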
diff --git a/spacy/lang/ms/tokenizer_exceptions.py b/spacy/lang/ms/tokenizer_exceptions.py
new file mode 100644
index 000000000..6b6cf3b15
--- /dev/null
+++ b/spacy/lang/ms/tokenizer_exceptions.py
@@ -0,0 +1,1533 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ._tokenizer_exceptions_list import MS_BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc
+
+
+# List of abbreviations and acronyms from:
+# https://ms.wiktionary.org/wiki/Wiktionary:Senarai_akronim_dan_singkatan
+
+_exc = {}
+
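+# Register each base exception in several casings (title case, upper case,
+# lower case, first-letter-capitalised, plus per-part casings for hyphenated
+# entries) so the exception matches however it is written in running text.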
+for orth in MS_BASE_EXCEPTIONS:
+ _exc[orth] = [{ORTH: orth}]
+ orth_title = orth.title()
+ _exc[orth_title] = [{ORTH: orth_title}]
+ orth_caps = orth.upper()
+ _exc[orth_caps] = [{ORTH: orth_caps}]
+ orth_lower = orth.lower()
+ _exc[orth_lower] = [{ORTH: orth_lower}]
+ orth_first_upper = orth[0].upper() + orth[1:]
+ _exc[orth_first_upper] = [{ORTH: orth_first_upper}]
+ if "-" in orth:
+ orth_title = "-".join([part.title() for part in orth.split("-")])
+ _exc[orth_title] = [{ORTH: orth_title}]
+ orth_caps = "-".join([part.upper() for part in orth.split("-")])
+ _exc[orth_caps] = [{ORTH: orth_caps}]
+
+for exc_data in [
+ {ORTH: "Jan.", NORM: "Januari"},
+ {ORTH: "Feb.", NORM: "Februari"},
+ {ORTH: "Mac.", NORM: "Mac"},
+ {ORTH: "Apr.", NORM: "April"},
+ {ORTH: "Jun.", NORM: "Jun"},
+ {ORTH: "Jul.", NORM: "Julai"},
+ {ORTH: "Ogos.", NORM: "Ogos"},
+ {ORTH: "Sep.", NORM: "September"},
+ {ORTH: "Okt.", NORM: "Oktober"},
+ {ORTH: "Nov.", NORM: "November"},
+ {ORTH: "Dis.", NORM: "Disember"},
+]:
+ _exc[exc_data[ORTH]] = [exc_data]
+
+_other_exc = {
+ "do'a": [{ORTH: "do'a", NORM: "doa"}],
+ "jum'at": [{ORTH: "jum'at", NORM: "Jumat"}],
+ "Jum'at": [{ORTH: "Jum'at", NORM: "Jumat"}],
+ "la'nat": [{ORTH: "la'nat", NORM: "laknat"}],
+ "ma'af": [{ORTH: "ma'af", NORM: "maaf"}],
+ "mu'jizat": [{ORTH: "mu'jizat", NORM: "mukjizat"}],
+ "Mu'jizat": [{ORTH: "Mu'jizat", NORM: "mukjizat"}],
+ "ni'mat": [{ORTH: "ni'mat", NORM: "nikmat"}],
+ "raka'at": [{ORTH: "raka'at", NORM: "rakaat"}],
+ "ta'at": [{ORTH: "ta'at", NORM: "taat"}],
+}
+
+_exc.update(_other_exc)
+
+for orth in [
+ "1 Kor.",
+ "1 Ptr.",
+ "1 Raj.",
+ "1 Sam.",
+ "1 Taw.",
+ "1 Tes.",
+ "1 Tim.",
+ "1 Yoh.",
+ "1Ch.",
+ "1Co.",
+ "1Jo.",
+ "1Ki.",
+ "1Pe.",
+ "1Sa.",
+ "1Th.",
+ "1Ti.",
+ "2 Kor.",
+ "2 Ptr.",
+ "2 Raj.",
+ "2 Sam.",
+ "2 Taw.",
+ "2 Tes.",
+ "2 Tim.",
+ "2 Yoh.",
+ "2Ch.",
+ "2Co.",
+ "2Jo.",
+ "2Ki.",
+ "2Pe.",
+ "2Sa.",
+ "2Th.",
+ "2Ti.",
+ "3 Yoh.",
+ "3D",
+ "3F",
+ "3Jo.",
+ "3M",
+ "8MP",
+ "AA",
+ "AAAAAA",
+ "AB",
+ "Abd.",
+ "ABC",
+ "ABIM",
+ "ABM",
+ "ABMI",
+ "ABS",
+ "AC",
+ "Ac",
+ "ACAPLPL",
+ "Act.",
+ "AD",
+ "AD LIB",
+ "ADAM",
+ "ADB",
+ "ADD",
+ "ADIL",
+ "ADN",
+ "ADR",
+ "ADRI",
+ "ADSL",
+ "ADUN",
+ "AFAS",
+ "AFTA",
+ "Ag",
+ "AGMARIS",
+ "AH",
+ "AI",
+ "AIA",
+ "AIDS",
+ "AIJV",
+ "AIM",
+ "a/k",
+ "ak",
+ "AKN",
+ "Al",
+ "a/l",
+ "AM",
+ "Am",
+ "Am.",
+ "AMN",
+ "Amo.",
+ "AMPS",
+ "Ams.",
+ "AMWA",
+ "AN",
+ "a.n.",
+ "ANGKASA",
+ "ANM",
+ "ANSI",
+ "Ant.",
+ "AOL",
+ "AP",
+ "a/p",
+ "APD",
+ "APEC",
+ "API",
+ "APIK",
+ "APM",
+ "APN",
+ "APP",
+ "Apr.",
+ "APRI",
+ "Ar",
+ "Ar.",
+ "ark.",
+ "A.S.",
+ "As",
+ "a.s.",
+ "ASA",
+ "ASAS 50",
+ "ASB",
+ "ASCII",
+ "ASEAN",
+ "ASEAN+3",
+ "ASEM",
+ "a.s.f.",
+ "ASN",
+ "a.s.o.",
+ "ASP",
+ "Ast.",
+ "A.T.",
+ "At",
+ "ATM",
+ "a.t.r.",
+ "ATUR",
+ "Au",
+ "AURI",
+ "Aug.",
+ "AWOL",
+ "Ayb.",
+ "B",
+ "BA",
+ "Ba",
+ "BAC",
+ "BAFIA",
+ "BAM",
+ "BANANA",
+ "BAPP",
+ "BASF",
+ "BATA",
+ "BB",
+ "BBC",
+ "BBE",
+ "BBS",
+ "BC",
+ "BCG",
+ "BCIC",
+ "b.d.",
+ "BDSSHAM",
+ "Be",
+ "BEER",
+ "BERNAMA",
+ "Bh",
+ "b.h.",
+ "Bhd.",
+ "Bi",
+ "BIDS",
+ "Bil.",
+ "bil.",
+ "BIMP-EAGA",
+ "Bio.",
+ "BIOS",
+ "BITMB",
+ "BJ",
+ "Bk",
+ "b.k.",
+ "BKAL",
+ "bkn.",
+ "BKP",
+ "BL",
+ "BLR",
+ "BM",
+ "BMI",
+ "BMW",
+ "BN",
+ "BNM",
+ "BO",
+ "BOJ",
+ "BOO",
+ "BOP",
+ "BOT",
+ "BP",
+ "b.p.",
+ "BPA",
+ "BPAs",
+ "bpd.",
+ "BPIMB",
+ "BPM",
+ "BPO",
+ "BPPH",
+ "Br",
+ "Br.",
+ "BSA",
+ "B.Sc.",
+ "B.Sh.",
+ "b.s.j.",
+ "BSN",
+ "Bt.",
+ "bt.",
+ "BWT",
+ "BYOB",
+ "C",
+ "C.",
+ "C/E",
+ "Ca",
+ "CAAM",
+ "CAD",
+ "CAM",
+ "CATV",
+ "CBS",
+ "CBT",
+ "CC",
+ "CCD",
+ "CCM",
+ "CCR",
+ "cct-km",
+ "CCTV",
+ "CCU",
+ "CD",
+ "Cd",
+ "CD-ROM",
+ "CD-RW",
+ "CDRC",
+ "Ce",
+ "CEO",
+ "CEPT",
+ "Cetak",
+ "Cf",
+ "CFO",
+ "CFTC",
+ "CGC",
+ "CGI",
+ "CH",
+ "CIA",
+ "CIAST",
+ "CID",
+ "CIDB",
+ "CIQ",
+ "CKD",
+ "CL",
+ "Cl",
+ "c.l.",
+ "CLI",
+ "CLOB",
+ "CM",
+ "Cm",
+ "cm.",
+ "CMAG",
+ "CMI",
+ "CMP",
+ "CNN",
+ "Co",
+ "COD",
+ "Col.",
+ "COLA",
+ "COMDEX",
+ "CP",
+ "CPI",
+ "CPO",
+ "CPR",
+ "CPU",
+ "Cr",
+ "CRDF",
+ "Cs",
+ "CST",
+ "CT",
+ "CTIP",
+ "CTRM",
+ "Cu",
+ "CUEPACS",
+ "D-8",
+ "d/a",
+ "DAGS",
+ "Dan.",
+ "DANCED",
+ "DAP",
+ "DARA",
+ "Db",
+ "DBKL",
+ "DBP",
+ "DBR",
+ "DC",
+ "DDA",
+ "DDT",
+ "DEB",
+ "Dec.",
+ "Deu.",
+ "DFIs",
+ "dgn.",
+ "DHL",
+ "DIBML",
+ "DIN",
+ "Dis.",
+ "DJ",
+ "d.l.l.",
+ "dlm.",
+ "dng.",
+ "DNS",
+ "DO",
+ "DOA",
+ "DOE",
+ "DOF",
+ "DOSH",
+ "doz.",
+ "DPPS",
+ "Dr.",
+ "dr.",
+ "drp.",
+ "drpd.",
+ "Ds",
+ "d.sb.",
+ "d.st.",
+ "DSTN2",
+ "Dt.",
+ "DTAs",
+ "DTMF",
+ "DTP",
+ "DTV",
+ "DUBES",
+ "DUNHILL",
+ "DV8",
+ "DVD",
+ "DVE",
+ "DVS",
+ "dw.t.",
+ "Dy",
+ "DYMM",
+ "E",
+ "E-Commerce",
+ "E-Dagang",
+ "E&E",
+ "E-Faraid",
+ "E-Government",
+ "E-Kerajaan",
+ "E-Mail",
+ "E-Services",
+ "E-Village",
+ "E-Zine",
+ "EALAF",
+ "EBI",
+ "EBP",
+ "EC",
+ "ECAFE",
+ "Ecc.",
+ "ECI",
+ "ECM",
+ "ECOSOC",
+ "ECP",
+ "ECR",
+ "EDI",
+ "EE",
+ "EEC",
+ "Ef.",
+ "EG",
+ "Eko.",
+ "EKS",
+ "ELWS",
+ "ELX",
+ "EMI",
+ "EMUs",
+ "En.",
+ "EP",
+ "EPF",
+ "Eph.",
+ "EPP",
+ "EPS",
+ "EPU",
+ "ER",
+ "Er",
+ "ERL",
+ "ERT",
+ "Es",
+ "ESCAP",
+ "ESOS",
+ "ESP",
+ "EST",
+ "Est.",
+ "ET",
+ "ETA",
+ "ETACS",
+ "ETC",
+ "ETD",
+ "EU",
+ "Eu",
+ "EVIAN",
+ "Exim Bank",
+ "Exo.",
+ "Eze.",
+ "Ezr.",
+ "F",
+ "FAM",
+ "FAMA",
+ "FAO",
+ "FAQ",
+ "FAX",
+ "FBI",
+ "FC",
+ "FCA",
+ "FCC",
+ "FDI",
+ "FE",
+ "Fe",
+ "f.e.",
+ "Feb.",
+ "FELCRA",
+ "FELDA",
+ "FI",
+ "FIA 1993",
+ "FIAT",
+ "FIC",
+ "FIDA",
+ "FIFA",
+ "FIMA",
+ "Fiz.",
+ "Flm.",
+ "Flp.",
+ "FM",
+ "Fm",
+ "FMUTM",
+ "FO",
+ "FOA",
+ "FOB",
+ "FOC",
+ "FOMCA",
+ "FORD",
+ "Fr",
+ "FRIM",
+ "FRTI",
+ "FSMP",
+ "FTA",
+ "FTE",
+ "FTP",
+ "G",
+ "g.",
+ "G15",
+ "G77",
+ "Ga",
+ "GAC",
+ "GACM",
+ "Gal.",
+ "GAPENA",
+ "GATS",
+ "GATT",
+ "GB",
+ "Gbps.",
+ "Gd",
+ "GDP",
+ "Ge",
+ "GEC",
+ "Gen.",
+ "Geo.",
+ "Geog.",
+ "Gerakan",
+ "GH",
+ "GIF",
+ "GII",
+ "GIS",
+ "GITIC",
+ "GITN",
+ "GJ",
+ "GLCs",
+ "GM",
+ "GMBH",
+ "GMI",
+ "GMT",
+ "GNP",
+ "GNS",
+ "GOLD",
+ "GP",
+ "GPC",
+ "GPIM",
+ "GPMS",
+ "GPO",
+ "GPP",
+ "GPS",
+ "GRO",
+ "GRS",
+ "GSMC",
+ "GST",
+ "GTZ",
+ "GUI",
+ "GWh.",
+ "H",
+ "Ha",
+ "Hab.",
+ "Hag.",
+ "Hak.",
+ "ham",
+ "hb.",
+ "HCI",
+ "HDTV",
+ "He",
+ "Heb.",
+ "Hf",
+ "Hg",
+ "HI-FI",
+ "HIS",
+ "HIV",
+ "Hj.",
+ "HMS",
+ "Ho",
+ "Hos.",
+ "HP",
+ "HRDC",
+ "HRDF",
+ "HRMIS",
+ "Hs",
+ "Hut.",
+ "I",
+ "I/O",
+ "IA",
+ "IAA",
+ "IADPs",
+ "IB",
+ "i.b.",
+ "IBA",
+ "IBFIM",
+ "IBG",
+ "Ibr.",
+ "IBRD",
+ "IBS",
+ "IC",
+ "ICA",
+ "ICBM",
+ "ICFM",
+ "ICI",
+ "ICM",
+ "ICOR",
+ "ICP",
+ "ICT",
+ "ICU",
+ "ID",
+ "Id.",
+ "IDB",
+ "IDFR",
+ "IE",
+ "i.e.",
+ "IFSB",
+ "IGAs",
+ "IGS",
+ "IHP",
+ "IHPG",
+ "IIM",
+ "IINA",
+ "IKKL",
+ "IKP",
+ "IKPH",
+ "IKS",
+ "Im.",
+ "IMD",
+ "IMF",
+ "IMP2",
+ "IMR",
+ "IMS-GT",
+ "IMT-GT",
+ "In",
+ "in.",
+ "INFRA",
+ "INSEP",
+ "INSPEN",
+ "INTAN",
+ "IOFC",
+ "IOU",
+ "IP",
+ "IPA",
+ "IPBA",
+ "IPCs",
+ "IPEBP",
+ "IPI",
+ "IPKIM",
+ "IPKPM",
+ "IPO",
+ "IPP",
+ "IPPM",
+ "IPPPM",
+ "i.pt.",
+ "IPTAR",
+ "IPTNM",
+ "IQR",
+ "Ir",
+ "IRA",
+ "IRPA",
+ "IRS",
+ "i.s.",
+ "ISA",
+ "Isa.",
+ "ISDN",
+ "ISMM",
+ "ISO",
+ "ISP",
+ "ist.",
+ "IT",
+ "i.t.",
+ "ITA",
+ "ITAF",
+ "ITEX",
+ "ITK",
+ "ITM",
+ "ITO",
+ "ITRCo",
+ "ITTA",
+ "ITU",
+ "JAK",
+ "JAKIM",
+ "Jam.",
+ "Jan.",
+ "Jb.",
+ "JBIC",
+ "JD",
+ "JDA",
+ "Jdg.",
+ "Jer.",
+ "Jh.",
+ "JICA",
+ "JJ",
+ "Jk.",
+ "JKKK",
+ "jkps.",
+ "JKR",
+ "JMTI",
+ "JOA",
+ "Joe.",
+ "Joh.",
+ "Jon.",
+ "Jos.",
+ "JP",
+ "JPA",
+ "JPEG",
+ "JPH",
+ "JPJ",
+ "JPSHK",
+ "JPS",
+ "JPT",
+ "JRDA",
+ "JSM",
+ "JT",
+ "Jud.",
+ "Jul.",
+ "Jun.",
+ "JVC",
+ "Jw.",
+ "K",
+ "K-Economy",
+ "KADA",
+ "KBE",
+ "KBIA",
+ "KBPA",
+ "KBSM",
+ "KD",
+ "Kd.",
+ "KDI",
+ "KDN",
+ "KDNK",
+ "KE",
+ "KEAP",
+ "Kej.",
+ "Kel.",
+ "KEM",
+ "KEMLU",
+ "kep.",
+ "Kg.",
+ "kg.",
+ "KGB",
+ "KGK",
+ "KH",
+ "ki.",
+ "Kid.",
+ "KIK",
+ "KIKMTT",
+ "KIM",
+ "Kim.",
+ "Kis.",
+ "KIX",
+ "KKGSK",
+ "KKK",
+ "KKPPA",
+ "KL",
+ "Kl.",
+ "KLCI",
+ "KLIA",
+ "KLIBOR",
+ "KLIM",
+ "KLM",
+ "KLSE",
+ "KM",
+ "KMM",
+ "KNK",
+ "KO",
+ "Kol.",
+ "Kom.",
+ "Komp.",
+ "KOMSAS",
+ "KPAI",
+ "KPB",
+ "KPBA",
+ "KPC",
+ "kpd.",
+ "KPE",
+ "KPIs",
+ "KPPL",
+ "KPPMS",
+ "KPWM",
+ "Kr",
+ "KRM",
+ "KSTI",
+ "KT",
+ "KTA",
+ "KTABKL",
+ "KTM",
+ "KTMB",
+ "kV",
+ "kW",
+ "kWh",
+ "kWj",
+ "KWSP",
+ "LA",
+ "La",
+ "LABOR",
+ "Lam.",
+ "LAN",
+ "LAPD",
+ "LASER",
+ "LAX",
+ "lb.",
+ "LC",
+ "LCD",
+ "LCHRF",
+ "LCLY",
+ "LED",
+ "Lev.",
+ "LFPR",
+ "LFS",
+ "LFX",
+ "LGM",
+ "Li",
+ "LID",
+ "Lin.",
+ "LKN",
+ "LKPM",
+ "LKPP",
+ "LKTP",
+ "LKWJ",
+ "LLB",
+ "LLC",
+ "LLN",
+ "LLS",
+ "LMSM",
+ "LNG",
+ "LOA",
+ "LOBATA",
+ "LOFSA",
+ "LPG",
+ "LPIP",
+ "LPKI",
+ "LPKLPL",
+ "LPKN",
+ "LPN",
+ "LPP",
+ "LPPK",
+ "LPPM",
+ "LPPP",
+ "LPPTP",
+ "Lr",
+ "LRs",
+ "LRT",
+ "LS",
+ "LTAKL",
+ "LTD",
+ "LTK",
+ "Lu",
+ "LUAS",
+ "Luk.",
+ "lw.",
+ "lwn.",
+ "M\n",
+ "m",
+ "M&A",
+ "MAB",
+ "MACRES",
+ "MAD",
+ "MADA",
+ "MAGERAN",
+ "MAHA",
+ "MAHSURI",
+ "Mal.",
+ "MALINDO",
+ "MAMPU",
+ "Mar.",
+ "MARA",
+ "MARC",
+ "MARDI",
+ "MARLBORO",
+ "MAS",
+ "MASSA",
+ "MASSCORP",
+ "Mat.",
+ "MATRADE",
+ "MAVCAP",
+ "MB",
+ "MBA",
+ "MBBS",
+ "MBM",
+ "MBO",
+ "MBS",
+ "MBTU",
+ "MC",
+ "MCA",
+ "MCB",
+ "MCSL",
+ "MCSv5",
+ "MD",
+ "Md",
+ "MDB",
+ "MDC",
+ "MDG",
+ "MDV",
+ "MEASAT",
+ "MEATJ",
+ "MECIB",
+ "MEMO",
+ "MENLU",
+ "MEPS",
+ "MES",
+ "MESDAQ",
+ "METEOR",
+ "MFI",
+ "MFIs",
+ "MG",
+ "Mg",
+ "MGM",
+ "MGR",
+ "MGS",
+ "MHA",
+ "Mi.",
+ "MIA",
+ "MIB",
+ "MIC",
+ "Mic.",
+ "MICE",
+ "MIDA",
+ "MIDF",
+ "MIDI",
+ "MIG",
+ "MIGHT",
+ "MII",
+ "MIMOS",
+ "MINDEF",
+ "MINT",
+ "mis.",
+ "MIT",
+ "MITC",
+ "MITI",
+ "Ml.",
+ "MLNG",
+ "mlpd.",
+ "MM",
+ "mm",
+ "MMN",
+ "mmscfd.",
+ "MMU",
+ "MMX",
+ "Mn",
+ "Mn.",
+ "MNA",
+ "MNCs",
+ "MO",
+ "Mo",
+ "MOA",
+ "MOD",
+ "MODEM",
+ "MOE",
+ "MOH",
+ "MOSTE",
+ "MOSTI",
+ "MOU",
+ "MP",
+ "MPB",
+ "MPEG",
+ "MPOB",
+ "MPP",
+ "mppa.",
+ "MPPJ",
+ "MPS",
+ "MPTM",
+ "MR",
+ "m.r.",
+ "MRB",
+ "MRELB",
+ "Mrk.",
+ "MRRDB",
+ "MS",
+ "MS-DOS",
+ "MSC",
+ "MSG",
+ "MSM",
+ "Mt",
+ "MTC",
+ "MTCP",
+ "MTD",
+ "MTDC",
+ "MTPB",
+ "MTV",
+ "Muz.",
+ "MV",
+ "MW",
+ "MY",
+ "MyKe",
+ "Mzm.",
+ "N",
+ "N/A",
+ "Na",
+ "NAB",
+ "NACIWID",
+ "Nah.",
+ "NAP",
+ "NASA",
+ "NATO",
+ "NAV",
+ "NB",
+ "Nb",
+ "NBA",
+ "NBC",
+ "NCR",
+ "Nd",
+ "NDP",
+ "Ne",
+ "NEAC",
+ "NEC",
+ "NEF",
+ "Neh.",
+ "NEP",
+ "NEqO",
+ "NERP",
+ "NF",
+ "NFPEs",
+ "NG",
+ "NGOs",
+ "NGV",
+ "NHEF",
+ "NHHES",
+ "NHK",
+ "Ni",
+ "NIDC",
+ "NIH",
+ "NIP",
+ "NIPA",
+ "NIS",
+ "NISIR",
+ "NITA",
+ "NITC",
+ "NITP",
+ "NIV",
+ "NLAC",
+ "NMPBSP",
+ "NMU",
+ "No",
+ "No.",
+ "no.",
+ "NOSS",
+ "Nov.",
+ "Np",
+ "NPC",
+ "NPCS",
+ "NPL",
+ "NRCC",
+ "NRW",
+ "NS",
+ "Ns",
+ "NSB",
+ "NTA",
+ "NTHRDC",
+ "NTMP",
+ "NTSC",
+ "Num.",
+ "NUTF",
+ "NVP",
+ "NVTC",
+ "NWRC",
+ "O",
+ "Ob.",
+ "Oba.",
+ "OC",
+ "OCPD",
+ "Oct.",
+ "OD",
+ "ODA",
+ "OECD",
+ "OEM",
+ "Ogo.",
+ "OHQs",
+ "OIC",
+ "Okt.",
+ "OPEC",
+ "OPP",
+ "OPP3",
+ "OPR",
+ "OS",
+ "Os",
+ "OSA",
+ "OT",
+ "OUG",
+ "oz.",
+ "P",
+ "P&P",
+ "PA",
+ "Pa",
+ "PABK",
+ "PABX",
+ "PAK",
+ "PAKSI",
+ "PAL",
+ "PALL MALL",
+ "PAS",
+ "PATA",
+ "PAWS",
+ "Pb",
+ "PBA",
+ "PBB",
+ "PBM",
+ "PBP",
+ "PBSM",
+ "PBT",
+ "PC",
+ "PC(s)",
+ "PCB",
+ "PCIRITA",
+ "PCM",
+ "PCMCIA",
+ "PCN",
+ "PD",
+ "Pd",
+ "pd.",
+ "PDS",
+ "PE",
+ "PEKEMAS",
+ "PEMADAM",
+ "PENA",
+ "PENIS",
+ "PERDANA",
+ "PERKESO",
+ "PERKIM",
+ "PERNAS",
+ "PERTAMA",
+ "PERTIWI",
+ "PESAKA",
+ "PETA",
+ "PETRONAS",
+ "PGU",
+ "Ph.",
+ "PHD",
+ "Phi.",
+ "Phm.",
+ "PIK",
+ "PIKOM",
+ "PIN",
+ "PINTAS",
+ "PIPM",
+ "PISK",
+ "PITA",
+ "PIXEL",
+ "PJ",
+ "PJK",
+ "PJKB",
+ "PJP",
+ "PKBM",
+ "PKBTA",
+ "PKEN",
+ "Pkh.",
+ "PKKM",
+ "PKLPA",
+ "PKM",
+ "PKNS",
+ "PKPIM",
+ "PKPM",
+ "PKR",
+ "PKS",
+ "Pl.",
+ "p.l.",
+ "PLA",
+ "PLC",
+ "PLCHP",
+ "PLCs",
+ "PLI",
+ "PLT",
+ "PLUS",
+ "PLWS",
+ "PM",
+ "Pm",
+ "PMM",
+ "PMP",
+ "PMR",
+ "PMS",
+ "Pn.",
+ "PNAT",
+ "PNS",
+ "PO",
+ "Po",
+ "POCPA",
+ "POKEMON",
+ "Pol.",
+ "POP",
+ "PORIM",
+ "PORLA",
+ "PORTAFOAM",
+ "PP",
+ "PPA",
+ "PPBE",
+ "PPBK",
+ "ppd.",
+ "PPGM",
+ "PPI",
+ "PPK",
+ "PPL",
+ "PPM",
+ "PPP",
+ "PPPB",
+ "PPPLM",
+ "PPPM",
+ "PPR",
+ "PPRT",
+ "PPS",
+ "PPTM",
+ "PPU",
+ "PR",
+ "Pr",
+ "Pr.",
+ "prb.",
+ "PRI",
+ "PRO",
+ "Pro.",
+ "Prof.",
+ "PROSPER",
+ "PROSTAR",
+ "PROTON",
+ "PS",
+ "PSA",
+ "Psa.",
+ "PSCs",
+ "PSDC",
+ "PSDH",
+ "Psi.",
+ "PSKE",
+ "PSRM",
+ "PST",
+ "PT",
+ "Pt",
+ "PTD",
+ "PTP",
+ "Pu",
+ "PUNB",
+ "QA",
+ "QC",
+ "QCC",
+ "R&D",
+ "RA",
+ "Ra",
+ "RAM",
+ "RAPP",
+ "Rat.",
+ "Rb",
+ "RCA",
+ "RDA",
+ "RDAs",
+ "RDCs",
+ "RE",
+ "Re",
+ "REHDA",
+ "Rev.",
+ "Rf",
+ "Rg",
+ "RGB",
+ "Rh",
+ "RI",
+ "RIDA",
+ "RIP",
+ "RISDA",
+ "r.l.",
+ "RM",
+ "Rm.",
+ "RMKe-8",
+ "Rn",
+ "ROC",
+ "ROM",
+ "Rom.",
+ "RPG",
+ "RPS",
+ "RRI",
+ "RRIM",
+ "RRJP",
+ "RRP",
+ "RSGC",
+ "RSS",
+ "RSVP",
+ "Rt.",
+ "RTA",
+ "RTM",
+ "Ru",
+ "Rut.",
+ "RWCR",
+ "RX",
+ "S",
+ "S/N",
+ "S&T",
+ "S-VHS",
+ "SA",
+ "SAC",
+ "SADCs",
+ "SAGA",
+ "SALCRA",
+ "SALM",
+ "SALT",
+ "SAM",
+ "SAP",
+ "SARS",
+ "Sas.",
+ "s.a.w.",
+ "SB",
+ "Sb",
+ "Sb.",
+ "SBA",
+ "SBB",
+ "sbg.",
+ "SBK",
+ "SC",
+ "Sc",
+ "SCA",
+ "SCADA",
+ "SCANS",
+ "SCSI",
+ "SCuM",
+ "SDCs",
+ "Sdn. Bhd.",
+ "sdr.",
+ "SDRC",
+ "Se",
+ "SEATO",
+ "SEB",
+ "SECAM",
+ "SEDCs",
+ "SEFF",
+ "Sej.",
+ "SEMS",
+ "Sep.",
+ "Sept.",
+ "SESB",
+ "SESCo",
+ "s.f.",
+ "Sg",
+ "SGPCA",
+ "SGPPI",
+ "SGPPKRM",
+ "SGX",
+ "Si",
+ "Si.",
+ "SIA 1983",
+ "SIC",
+ "SIM",
+ "SING",
+ "SIRIM",
+ "SITTDEC",
+ "sj.",
+ "SKDTP",
+ "SKM",
+ "SKSM",
+ "SL",
+ "Sl.",
+ "sl.",
+ "SLMCH",
+ "SLR",
+ "SM",
+ "Sm",
+ "SMART",
+ "SMEs",
+ "SMEt",
+ "SMIs",
+ "SMIDEC",
+ "SMIDP",
+ "SMJK",
+ "SMR",
+ "SMS",
+ "SMT",
+ "SMTP",
+ "SN",
+ "Sn",
+ "SOB",
+ "SOCSO",
+ "SOHO",
+ "Son.",
+ "SOS",
+ "Sos.",
+ "SP",
+ "SPA",
+ "SPAM",
+ "SPCA",
+ "SPKR",
+ "SPLAM",
+ "SPM",
+ "SPNB",
+ "SPSP",
+ "t.",
+ "Ta",
+ "Tadb.",
+ "TAF",
+ "TAF-W",
+ "Tani",
+ "TAP",
+ "TAR",
+ "TARBI",
+ "TB",
+ "Tb",
+ "TBA",
+ "TBTP",
+ "Tc",
+ "TCPD",
+ "TDCs",
+ "Te",
+ "TEKUN",
+ "TELCO",
+ "TELEX",
+ "TEUs",
+ "TFP",
+ "TGV",
+ "TH",
+ "Th",
+ "THIS",
+ "Ti",
+ "TICAD",
+ "Tit.",
+ "TKA",
+ "Tks.",
+ "Tl",
+ "TLDM",
+ "TM",
+ "Tm",
+ "TMB",
+ "TMK",
+ "TNB",
+ "TNSB",
+ "TNT",
+ "TOEFL",
+ "TP",
+ "TPIM",
+ "TPK",
+ "TPPP",
+ "TPPT",
+ "TPSM",
+ "TPUB",
+ "TQM",
+ "Tr.",
+ "TRIPs",
+ "tsb.",
+ "tscf.",
+ "t.sh.",
+ "t.s.t.",
+ "TT",
+ "t.t.",
+ "TUDM",
+ "TV",
+ "TVSMR",
+ "TWAIN",
+ "TX",
+ "TYPHIrapid",
+ "U",
+ "Ubat",
+ "UDA",
+ "Udg.",
+ "UFO",
+ "UH",
+ "UIA",
+ "UiTM",
+ "UK",
+ "UKM",
+ "UL",
+ "Ul.",
+ "ULC",
+ "UM",
+ "UMNO",
+ "UMS",
+ "UN",
+ "UN/OSCAL",
+ "UNCLE",
+ "UNCTAD",
+ "UNDP",
+ "UNESCO",
+ "UNFCCC",
+ "UNFPA",
+ "UNHCR",
+ "UNICEF",
+ "UNIMAS",
+ "UNTAET",
+ "UPE",
+ "UPM",
+ "UPS",
+ "UPSR",
+ "URL",
+ "US",
+ "USAINS",
+ "USD",
+ "USM",
+ "USNO",
+ "USS",
+ "USSR",
+ "UTC",
+ "UTF",
+ "utk.",
+ "UTM",
+ "V",
+ "VAT",
+ "VCC",
+ "VCD",
+ "VCR",
+ "VD",
+ "VDSC",
+ "VGA",
+ "VHF",
+ "VHS",
+ "VIP",
+ "VMS",
+ "VO",
+ "VOA",
+ "VoIP",
+ "VR",
+ "VSOP",
+ "VW",
+ "W",
+ "W/O",
+ "WAP",
+ "WAY",
+ "WC",
+ "WDDM",
+ "WDM",
+ "WHO",
+ "Why.",
+ "WIM",
+ "WPG",
+ "WTO",
+ "WWF",
+ "WWW",
+ "WYSIWYG",
+ "Xe",
+ "XO",
+ "XXL",
+ "Y",
+ "Y2K",
+ "YAB",
+ "Yak.",
+ "YAM",
+ "YAS",
+ "YB",
+ "Yb",
+ "Yeh.",
+ "Yer.",
+ "Yes.",
+ "yg.",
+ "Yl.",
+ "YM",
+ "YMCA",
+ "Yoh.",
+ "Yos.",
+ "Y.Th.",
+ "YTM",
+ "Yud.",
+ "Yun.",
+ "Za.",
+ "Zec.",
+ "Zef.",
+ "Zep.",
+ "ZIP",
+ "Zn",
+ "Zr",
+]:
+    # Don't overwrite entries added above (e.g. month abbreviations with NORM).
+    _exc.setdefault(orth, [{ORTH: orth}])
+
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
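
The `_other_exc` entries keep colloquial spellings together and attach a normalised form. A quick behavioural check, assuming this diff is installed; the sentence is illustrative:

```python
import spacy

nlp = spacy.blank("ms")
doc = nlp("Dia memohon do'a restu.")
print([t.text for t in doc])  # ["Dia", "memohon", "do'a", "restu", "."]
print(doc[2].norm_)           # 'doa', from the NORM set on the exception
```
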
diff --git a/spacy/language.py b/spacy/language.py
index c4962ea45..6d867820a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,6 +1,6 @@
from typing import Iterator, Optional, Any, Dict, Callable, Iterable
from typing import Union, Tuple, List, Set, Pattern, Sequence, overload
-from typing import NoReturn, TYPE_CHECKING, TypeVar, cast
+from typing import NoReturn, TypeVar, cast
from dataclasses import dataclass
import random
@@ -1325,7 +1325,10 @@ class Language:
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
)
doc = Doc(self.vocab, words=["x", "y", "z"])
- get_examples = lambda: [Example.from_dict(doc, {})]
+
+ def get_examples():
+ return [Example.from_dict(doc, {})]
+
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(
method="Language.initialize", obj=type(get_examples)
@@ -1428,6 +1431,7 @@ class Language:
scorer: Optional[Scorer] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
scorer_cfg: Optional[Dict[str, Any]] = None,
+ per_component: bool = False,
) -> Dict[str, Any]:
"""Evaluate a model's pipeline components.
@@ -1439,6 +1443,8 @@ class Language:
arguments for specific components.
scorer_cfg (dict): An optional dictionary with extra keyword arguments
for the scorer.
+ per_component (bool): Whether to return the scores keyed by component
+ name. Defaults to False.
RETURNS (Scorer): The scorer containing the evaluation results.
@@ -1471,7 +1477,7 @@ class Language:
for eg, doc in zip(examples, docs):
eg.predicted = doc
end_time = timer()
- results = scorer.score(examples)
+ results = scorer.score(examples, per_component=per_component)
n_words = sum(len(eg.predicted) for eg in examples)
results["speed"] = n_words / (end_time - start_time)
return results
@@ -1847,7 +1853,7 @@ class Language:
# and aren't built by factory.
missing_components = _find_missing_components(pipeline, pipe_instances, exclude)
if missing_components:
- raise ValueError(Errors.E1052.format(names=", ".join(missing_components)))
+ raise ValueError(Errors.E1055.format(names=", ".join(missing_components)))
# If components are loaded from a source (existing models), we cache
# them here so they're only loaded once
source_nlps = {}
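
The new `per_component` flag changes the shape of `Language.evaluate`'s return value from one flat dict to a dict keyed by component name. A sketch, assuming a trained pipeline such as `en_core_web_sm` is available; the example sentence and entity annotation are illustrative:

```python
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")  # assumed to be installed
examples = [
    Example.from_dict(nlp.make_doc("I like London."), {"entities": [(7, 13, "GPE")]})
]
flat = nlp.evaluate(examples)                        # {'ents_f': ..., 'speed': ...}
nested = nlp.evaluate(examples, per_component=True)  # {'ner': {'ents_f': ...}, ...}
print(sorted(nested))
```
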
diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py
index 9b7628f0e..5125018e5 100644
--- a/spacy/ml/models/__init__.py
+++ b/spacy/ml/models/__init__.py
@@ -1,6 +1,7 @@
from .entity_linker import * # noqa
from .multi_task import * # noqa
from .parser import * # noqa
+from .span_finder import * # noqa
from .spancat import * # noqa
from .tagger import * # noqa
from .textcat import * # noqa
diff --git a/spacy/ml/models/span_finder.py b/spacy/ml/models/span_finder.py
new file mode 100644
index 000000000..a805e2086
--- /dev/null
+++ b/spacy/ml/models/span_finder.py
@@ -0,0 +1,42 @@
+from typing import Callable, List, Tuple
+
+from thinc.api import Model, chain, with_array
+from thinc.types import Floats2d
+
+from ...tokens import Doc
+
+from ...util import registry
+
+InT = List[Doc]
+OutT = Floats2d
+
+
+@registry.architectures("spacy.SpanFinder.v1")
+def build_finder_model(
+ tok2vec: Model[InT, List[Floats2d]], scorer: Model[OutT, OutT]
+) -> Model[InT, OutT]:
+    """Build a model that assigns a start and an end score to every token."""
+    logistic_layer: Model[List[Floats2d], List[Floats2d]] = with_array(scorer)
+ model: Model[InT, OutT] = chain(tok2vec, logistic_layer, flattener())
+ model.set_ref("tok2vec", tok2vec)
+ model.set_ref("scorer", scorer)
+ model.set_ref("logistic_layer", logistic_layer)
+
+ return model
+
+
+def flattener() -> Model[List[Floats2d], Floats2d]:
+ """Flattens the input to a 1-dimensional list of scores"""
+
+ def forward(
+ model: Model[Floats1d, Floats1d], X: List[Floats2d], is_train: bool
+ ) -> Tuple[Floats2d, Callable[[Floats2d], List[Floats2d]]]:
+ lens = model.ops.asarray1i([len(doc) for doc in X])
+ Y = model.ops.flatten(X)
+
+ def backprop(dY: Floats2d) -> List[Floats2d]:
+ return model.ops.unflatten(dY, lens)
+
+ return Y, backprop
+
+ return Model("Flattener", forward=forward)
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 26931606b..40e3fd638 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -2,21 +2,22 @@ from .attributeruler import AttributeRuler
from .dep_parser import DependencyParser
from .edit_tree_lemmatizer import EditTreeLemmatizer
from .entity_linker import EntityLinker
-from .ner import EntityRecognizer
from .entityruler import EntityRuler
+from .functions import merge_entities, merge_noun_chunks, merge_subtokens
from .lemmatizer import Lemmatizer
from .morphologizer import Morphologizer
+from .ner import EntityRecognizer
from .pipe import Pipe
-from .trainable_pipe import TrainablePipe
-from .senter import SentenceRecognizer
from .sentencizer import Sentencizer
+from .senter import SentenceRecognizer
+from .span_finder import SpanFinder
+from .span_ruler import SpanRuler
+from .spancat import SpanCategorizer
from .tagger import Tagger
from .textcat import TextCategorizer
-from .spancat import SpanCategorizer
-from .span_ruler import SpanRuler
from .textcat_multilabel import MultiLabel_TextCategorizer
from .tok2vec import Tok2Vec
-from .functions import merge_entities, merge_noun_chunks, merge_subtokens
+from .trainable_pipe import TrainablePipe
__all__ = [
"AttributeRuler",
@@ -31,6 +32,7 @@ __all__ = [
"SentenceRecognizer",
"Sentencizer",
"SpanCategorizer",
+ "SpanFinder",
"SpanRuler",
"Tagger",
"TextCategorizer",
diff --git a/spacy/pipeline/span_finder.py b/spacy/pipeline/span_finder.py
new file mode 100644
index 000000000..da3c38430
--- /dev/null
+++ b/spacy/pipeline/span_finder.py
@@ -0,0 +1,336 @@
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+
+from thinc.api import Config, Model, Optimizer, set_dropout_rate
+from thinc.types import Floats2d
+
+from ..language import Language
+from .trainable_pipe import TrainablePipe
+from ..scorer import Scorer
+from ..tokens import Doc, Span
+from ..training import Example
+from ..errors import Errors
+
+from ..util import registry
+from .spancat import DEFAULT_SPANS_KEY
+
+span_finder_default_config = """
+[model]
+@architectures = "spacy.SpanFinder.v1"
+
+[model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[model.tok2vec]
+@architectures = "spacy.Tok2Vec.v2"
+
+[model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v2"
+width = 96
+rows = [5000, 1000, 2500, 1000]
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+include_static_vectors = false
+
+[model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = ${model.tok2vec.embed.width}
+window_size = 1
+maxout_pieces = 3
+depth = 4
+"""
+
+DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model"]
+
+
+@Language.factory(
+ "span_finder",
+ assigns=["doc.spans"],
+ default_config={
+ "threshold": 0.5,
+ "model": DEFAULT_SPAN_FINDER_MODEL,
+ "spans_key": DEFAULT_SPANS_KEY,
+ "max_length": None,
+ "min_length": None,
+ "scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
+ },
+ default_score_weights={
+ f"span_finder_{DEFAULT_SPANS_KEY}_f": 1.0,
+ f"span_finder_{DEFAULT_SPANS_KEY}_p": 0.0,
+ f"span_finder_{DEFAULT_SPANS_KEY}_r": 0.0,
+ },
+)
+def make_span_finder(
+ nlp: Language,
+ name: str,
+ model: Model[Iterable[Doc], Floats2d],
+ spans_key: str,
+ threshold: float,
+ max_length: Optional[int],
+ min_length: Optional[int],
+ scorer: Optional[Callable],
+) -> "SpanFinder":
+ """Create a SpanFinder component. The component predicts whether a token is
+ the start or the end of a potential span.
+
+ model (Model[List[Doc], Floats2d]): A model instance that
+        is given a list of documents and predicts start and end probabilities
+        for each token.
+ spans_key (str): Key of the doc.spans dict to save the spans under. During
+ initialization and training, the component will look for spans on the
+ reference document under the same key.
+ threshold (float): Minimum probability to consider a prediction positive.
+ max_length (Optional[int]): Maximum length of the produced spans, defaults
+ to None meaning unlimited length.
+ min_length (Optional[int]): Minimum length of the produced spans, defaults
+ to None meaning shortest span length is 1.
+ scorer (Optional[Callable]): The scoring method. Defaults to
+ Scorer.score_spans for the Doc.spans[spans_key] with overlapping
+ spans allowed.
+ """
+ return SpanFinder(
+ nlp,
+ model=model,
+ threshold=threshold,
+ name=name,
+ scorer=scorer,
+ max_length=max_length,
+ min_length=min_length,
+ spans_key=spans_key,
+ )
+
+
+@registry.scorers("spacy.span_finder_scorer.v1")
+def make_span_finder_scorer():
+ return span_finder_score
+
+
+def span_finder_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+ kwargs = dict(kwargs)
+ attr_prefix = "span_finder_"
+ key = kwargs["spans_key"]
+ kwargs.setdefault("attr", f"{attr_prefix}{key}")
+ kwargs.setdefault(
+ "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
+ )
+ kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
+ kwargs.setdefault("allow_overlap", True)
+ kwargs.setdefault("labeled", False)
+ scores = Scorer.score_spans(examples, **kwargs)
+ scores.pop(f"{kwargs['attr']}_per_type", None)
+ return scores
+
+
+def _char_indices(span: Span) -> Tuple[int, int]:
+ start = span[0].idx
+ end = span[-1].idx + len(span[-1])
+ return start, end
+
+
+class SpanFinder(TrainablePipe):
+ """Pipeline that learns span boundaries.
+
+ DOCS: https://spacy.io/api/spanfinder
+ """
+
+ def __init__(
+ self,
+ nlp: Language,
+ model: Model[Iterable[Doc], Floats2d],
+ name: str = "span_finder",
+ *,
+ spans_key: str = DEFAULT_SPANS_KEY,
+ threshold: float = 0.5,
+ max_length: Optional[int] = None,
+ min_length: Optional[int] = None,
+ scorer: Optional[Callable] = span_finder_score,
+ ) -> None:
+ """Initialize the span finder.
+ model (thinc.api.Model): The Thinc Model powering the pipeline
+ component.
+ name (str): The component instance name, used to add entries to the
+ losses during training.
+ threshold (float): Minimum probability to consider a prediction
+ positive.
+ scorer (Optional[Callable]): The scoring method.
+ spans_key (str): Key of the doc.spans dict to save the spans under.
+ During initialization and training, the component will look for
+ spans on the reference document under the same key.
+ max_length (Optional[int]): Maximum length of the produced spans,
+ defaults to None meaning unlimited length.
+ min_length (Optional[int]): Minimum length of the produced spans,
+ defaults to None meaning shortest span length is 1.
+
+ DOCS: https://spacy.io/api/spanfinder#init
+ """
+ self.vocab = nlp.vocab
+ if (max_length is not None and max_length < 1) or (
+ min_length is not None and min_length < 1
+ ):
+ raise ValueError(
+ Errors.E1053.format(min_length=min_length, max_length=max_length)
+ )
+ self.model = model
+ self.name = name
+ self.scorer = scorer
+ self.cfg: Dict[str, Any] = {
+ "min_length": min_length,
+ "max_length": max_length,
+ "threshold": threshold,
+ "spans_key": spans_key,
+ }
+
+ def predict(self, docs: Iterable[Doc]):
+ """Apply the pipeline's model to a batch of docs, without modifying
+ them.
+
+ docs (Iterable[Doc]): The documents to predict.
+        RETURNS: The model's prediction for each document.
+
+ DOCS: https://spacy.io/api/spanfinder#predict
+ """
+ scores = self.model.predict(docs)
+ return scores
+
+ def set_annotations(self, docs: Iterable[Doc], scores: Floats2d) -> None:
+ """Modify a batch of Doc objects, using pre-computed scores.
+ docs (Iterable[Doc]): The documents to modify.
+        scores: The scores to set, produced by the SpanFinder's predict method.
+
+ DOCS: https://spacy.io/api/spanfinder#set_annotations
+ """
+ offset = 0
+ for i, doc in enumerate(docs):
+ doc.spans[self.cfg["spans_key"]] = []
+ starts = []
+ ends = []
+ doc_scores = scores[offset : offset + len(doc)]
+
+ for token, token_score in zip(doc, doc_scores):
+ if token_score[0] >= self.cfg["threshold"]:
+ starts.append(token.i)
+ if token_score[1] >= self.cfg["threshold"]:
+ ends.append(token.i)
+
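+            # Pair every predicted start with every predicted end to its
+            # right; keep the span if it satisfies the length limits. Spans
+            # may overlap, which downstream components like spancat expect.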
+ for start in starts:
+ for end in ends:
+ span_length = end + 1 - start
+ if span_length < 1:
+ continue
+ if (
+ self.cfg["min_length"] is None
+ or self.cfg["min_length"] <= span_length
+ ) and (
+ self.cfg["max_length"] is None
+ or span_length <= self.cfg["max_length"]
+ ):
+ doc.spans[self.cfg["spans_key"]].append(doc[start : end + 1])
+ offset += len(doc)
+
+ def update(
+ self,
+ examples: Iterable[Example],
+ *,
+ drop: float = 0.0,
+ sgd: Optional[Optimizer] = None,
+ losses: Optional[Dict[str, float]] = None,
+ ) -> Dict[str, float]:
+ """Learn from a batch of documents and gold-standard information,
+ updating the pipe's model. Delegates to predict and get_loss.
+ examples (Iterable[Example]): A batch of Example objects.
+ drop (float): The dropout rate.
+ sgd (Optional[thinc.api.Optimizer]): The optimizer.
+ losses (Optional[Dict[str, float]]): Optional record of the loss during
+ training. Updated using the component name as the key.
+ RETURNS (Dict[str, float]): The updated losses dictionary.
+
+ DOCS: https://spacy.io/api/spanfinder#update
+ """
+ if losses is None:
+ losses = {}
+ losses.setdefault(self.name, 0.0)
+ predicted = [eg.predicted for eg in examples]
+ set_dropout_rate(self.model, drop)
+ scores, backprop_scores = self.model.begin_update(predicted)
+ loss, d_scores = self.get_loss(examples, scores)
+ backprop_scores(d_scores)
+ if sgd is not None:
+ self.finish_update(sgd)
+ losses[self.name] += loss
+ return losses
+
+ def get_loss(self, examples, scores) -> Tuple[float, Floats2d]:
+ """Find the loss and gradient of loss for the batch of documents and
+ their predicted scores.
+        examples (Iterable[Example]): The batch of examples.
+ scores: Scores representing the model's predictions.
+ RETURNS (Tuple[float, Floats2d]): The loss and the gradient.
+
+ DOCS: https://spacy.io/api/spanfinder#get_loss
+ """
+ truths, masks = self._get_aligned_truth_scores(examples, self.model.ops)
+ d_scores = scores - self.model.ops.asarray2f(truths)
+ d_scores *= masks
+ loss = float((d_scores**2).sum())
+ return loss, d_scores
+
+ def _get_aligned_truth_scores(self, examples, ops) -> Tuple[Floats2d, Floats2d]:
+ """Align scores of the predictions to the references for calculating
+ the loss.
+ """
+ truths = []
+ masks = []
+ for eg in examples:
+ if eg.x.text != eg.y.text:
+ raise ValueError(Errors.E1054.format(component="span_finder"))
+ n_tokens = len(eg.predicted)
+ truth = ops.xp.zeros((n_tokens, 2), dtype="float32")
+ mask = ops.xp.ones((n_tokens, 2), dtype="float32")
+ if self.cfg["spans_key"] in eg.reference.spans:
+ for span in eg.reference.spans[self.cfg["spans_key"]]:
+ ref_start_char, ref_end_char = _char_indices(span)
+ pred_span = eg.predicted.char_span(
+ ref_start_char, ref_end_char, alignment_mode="expand"
+ )
+ pred_start_char, pred_end_char = _char_indices(pred_span)
+ start_match = pred_start_char == ref_start_char
+ end_match = pred_end_char == ref_end_char
+ if start_match:
+ truth[pred_span[0].i, 0] = 1
+ else:
+ mask[pred_span[0].i, 0] = 0
+ if end_match:
+ truth[pred_span[-1].i, 1] = 1
+ else:
+ mask[pred_span[-1].i, 1] = 0
+ truths.append(truth)
+ masks.append(mask)
+ truths = ops.xp.concatenate(truths, axis=0)
+ masks = ops.xp.concatenate(masks, axis=0)
+ return truths, masks
+
+ def initialize(
+ self,
+ get_examples: Callable[[], Iterable[Example]],
+ *,
+ nlp: Optional[Language] = None,
+ ) -> None:
+ """Initialize the pipe for training, using a representative set
+ of data examples.
+ get_examples (Callable[[], Iterable[Example]]): Function that
+ returns a representative sample of gold-standard Example objects.
+ nlp (Optional[Language]): The current nlp object the component is part
+ of.
+
+ DOCS: https://spacy.io/api/spanfinder#initialize
+ """
+ subbatch: List[Example] = []
+
+ for eg in get_examples():
+ if len(subbatch) < 10:
+ subbatch.append(eg)
+
+ if subbatch:
+ docs = [eg.reference for eg in subbatch]
+ Y, _ = self._get_aligned_truth_scores(subbatch, self.model.ops)
+ self.model.initialize(X=docs, Y=Y)
+ else:
+ self.model.initialize()
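
Since `make_span_finder` registers the component with `Language.factory`, it can be added to a pipeline by name. A minimal usage sketch, assuming this diff is installed; the `"sf"` key and the sentence are illustrative:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("span_finder", config={"spans_key": "sf"})
nlp.initialize()  # without examples, spaCy initializes on dummy data
doc = nlp("I like London and Berlin.")
print(doc.spans["sf"])  # untrained, so the span group is empty or noisy
```
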
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 5a087e42a..08a5478a9 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,22 +1,20 @@
-from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
from dataclasses import dataclass
from functools import partial
-from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
-from thinc.api import Optimizer
-from thinc.types import Ragged, Ints2d, Floats2d
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
import numpy
+from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
+from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
from ..compat import Protocol, runtime_checkable
-from ..scorer import Scorer
-from ..language import Language
-from .trainable_pipe import TrainablePipe
-from ..tokens import Doc, SpanGroup, Span
-from ..vocab import Vocab
-from ..training import Example, validate_examples
from ..errors import Errors
+from ..language import Language
+from ..scorer import Scorer
+from ..tokens import Doc, Span, SpanGroup
+from ..training import Example, validate_examples
from ..util import registry
-
+from ..vocab import Vocab
+from .trainable_pipe import TrainablePipe
spancat_default_config = """
[model]
@@ -33,8 +31,8 @@ hidden_size = 128
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
-rows = [5000, 2000, 1000, 1000]
-attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]
+rows = [5000, 1000, 2500, 1000]
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[model.tok2vec.encode]
@@ -71,6 +69,7 @@ maxout_pieces = 3
depth = 4
"""
+DEFAULT_SPANS_KEY = "sc"
DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]
DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str(
spancat_singlelabel_default_config
@@ -112,6 +111,29 @@ def ngram_suggester(
return output
+def preset_spans_suggester(
+ docs: Iterable[Doc], spans_key: str, *, ops: Optional[Ops] = None
+) -> Ragged:
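+    # Emit one [start, end] row per existing span plus a per-doc length
+    # vector, so the output Ragged mirrors what the ngram suggesters produce.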
+ if ops is None:
+ ops = get_current_ops()
+ spans = []
+ lengths = []
+ for doc in docs:
+ length = 0
+ if doc.spans[spans_key]:
+ for span in doc.spans[spans_key]:
+ spans.append([span.start, span.end])
+ length += 1
+
+ lengths.append(length)
+ lengths_array = cast(Ints1d, ops.asarray(lengths, dtype="i"))
+ if len(spans) > 0:
+ output = Ragged(ops.asarray(spans, dtype="i"), lengths_array)
+ else:
+ output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+ return output
+
+
@registry.misc("spacy.ngram_suggester.v1")
def build_ngram_suggester(sizes: List[int]) -> Suggester:
"""Suggest all spans of the given lengths. Spans are returned as a ragged
@@ -130,12 +152,20 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
return build_ngram_suggester(sizes)
+@registry.misc("spacy.preset_spans_suggester.v1")
+def build_preset_spans_suggester(spans_key: str) -> Suggester:
+ """Suggest all spans that are already stored in doc.spans[spans_key].
+ This is useful when an upstream component is used to set the spans
+ on the Doc such as a SpanRuler or SpanFinder."""
+ return partial(preset_spans_suggester, spans_key=spans_key)
+
+
@Language.factory(
"spancat",
assigns=["doc.spans"],
default_config={
"threshold": 0.5,
- "spans_key": "sc",
+ "spans_key": DEFAULT_SPANS_KEY,
"max_positive": None,
"model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
@@ -199,7 +229,7 @@ def make_spancat(
"spancat_singlelabel",
assigns=["doc.spans"],
default_config={
- "spans_key": "sc",
+ "spans_key": DEFAULT_SPANS_KEY,
"model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
"negative_weight": 1.0,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
diff --git a/spacy/scorer.py b/spacy/scorer.py
index de4f52be6..86cd00a50 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -121,20 +121,30 @@ class Scorer:
nlp.add_pipe(pipe)
self.nlp = nlp
- def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
+ def score(
+ self, examples: Iterable[Example], *, per_component: bool = False
+ ) -> Dict[str, Any]:
"""Evaluate a list of Examples.
examples (Iterable[Example]): The predicted annotations + correct annotations.
+ per_component (bool): Whether to return the scores keyed by component
+ name. Defaults to False.
RETURNS (Dict): A dictionary of scores.
DOCS: https://spacy.io/api/scorer#score
"""
scores = {}
if hasattr(self.nlp.tokenizer, "score"):
- scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
+ if per_component:
+ scores["tokenizer"] = self.nlp.tokenizer.score(examples, **self.cfg)
+ else:
+ scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
for name, component in self.nlp.pipeline:
if hasattr(component, "score"):
- scores.update(component.score(examples, **self.cfg))
+ if per_component:
+ scores[name] = component.score(examples, **self.cfg)
+ else:
+ scores.update(component.score(examples, **self.cfg))
return scores
@staticmethod
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 3a5c8e451..00b8f5f1c 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -291,6 +291,11 @@ def ml_tokenizer():
return get_lang_class("ml")().tokenizer
+@pytest.fixture(scope="session")
+def ms_tokenizer():
+ return get_lang_class("ms")().tokenizer
+
+
@pytest.fixture(scope="session")
def nb_tokenizer():
return get_lang_class("nb")().tokenizer
diff --git a/spacy/tests/doc/test_span_group.py b/spacy/tests/doc/test_span_group.py
index 818569c64..cea2c42ee 100644
--- a/spacy/tests/doc/test_span_group.py
+++ b/spacy/tests/doc/test_span_group.py
@@ -93,6 +93,21 @@ def test_span_group_copy(doc):
assert span_group.attrs["key"] == "value"
assert list(span_group) != list(clone)
+ # can't copy if the character offsets don't align to tokens
+ doc2 = Doc(doc.vocab, words=[t.text + "x" for t in doc])
+ with pytest.raises(ValueError):
+ span_group.copy(doc=doc2)
+
+ # can copy with valid character offsets despite different tokenization
+ doc3 = doc.copy()
+ with doc3.retokenize() as retokenizer:
+ retokenizer.merge(doc3[0:2])
+ retokenizer.merge(doc3[3:6])
+ span_group = SpanGroup(doc, spans=[doc[0:6], doc[3:6]])
+ for span1, span2 in zip(span_group, span_group.copy(doc=doc3)):
+ assert span1.start_char == span2.start_char
+ assert span1.end_char == span2.end_char
+
def test_span_group_set_item(doc, other_doc):
span_group = doc.spans["SPANS"]
@@ -253,3 +268,12 @@ def test_span_group_typing(doc: Doc):
for i, span in enumerate(span_group):
assert span == span_group[i] == spans[i]
filter_spans(span_group)
+
+
+def test_span_group_init_doc(en_tokenizer):
+ """Test that all spans must come from the specified doc."""
+ doc1 = en_tokenizer("a b c")
+ doc2 = en_tokenizer("a b c")
+ span_group = SpanGroup(doc1, spans=[doc1[0:1], doc1[1:2]])
+ with pytest.raises(ValueError):
+ span_group = SpanGroup(doc1, spans=[doc1[0:1], doc2[1:2]])
diff --git a/spacy/tests/lang/ms/__init__.py b/spacy/tests/lang/ms/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/ms/test_noun_chunks.py b/spacy/tests/lang/ms/test_noun_chunks.py
new file mode 100644
index 000000000..859307d00
--- /dev/null
+++ b/spacy/tests/lang/ms/test_noun_chunks.py
@@ -0,0 +1,8 @@
+import pytest
+
+
+def test_noun_chunks_is_parsed_ms(ms_tokenizer):
+ """Test that noun_chunks raises Value Error for 'ms' language if Doc is not parsed."""
+ doc = ms_tokenizer("sebelas")
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
diff --git a/spacy/tests/lang/ms/test_prefix_suffix_infix.py b/spacy/tests/lang/ms/test_prefix_suffix_infix.py
new file mode 100644
index 000000000..0d2b2c507
--- /dev/null
+++ b/spacy/tests/lang/ms/test_prefix_suffix_infix.py
@@ -0,0 +1,112 @@
+import pytest
+
+
+@pytest.mark.parametrize("text", ["(Ma'arif)"])
+def test_ms_tokenizer_splits_no_special(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize("text", ["Ma'arif"])
+def test_ms_tokenizer_splits_no_punct(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 1
+
+
+@pytest.mark.parametrize("text", ["(Ma'arif"])
+def test_ms_tokenizer_splits_prefix_punct(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 2
+
+
+@pytest.mark.parametrize("text", ["Ma'arif)"])
+def test_ms_tokenizer_splits_suffix_punct(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 2
+
+
+@pytest.mark.parametrize("text", ["(Ma'arif)"])
+def test_ms_tokenizer_splits_even_wrap(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize("text", ["(Ma'arif?)"])
+def test_ms_tokenizer_splits_uneven_wrap(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 4
+
+
+@pytest.mark.parametrize("text,length", [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
+def test_ms_tokenizer_splits_prefix_interact(ms_tokenizer, text, length):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == length
+
+
+@pytest.mark.parametrize("text", ["S.Kom.)"])
+def test_ms_tokenizer_splits_suffix_interact(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 2
+
+
+@pytest.mark.parametrize("text", ["(S.Kom.)"])
+def test_ms_tokenizer_splits_even_wrap_interact(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize("text", ["(S.Kom.?)"])
+def test_ms_tokenizer_splits_uneven_wrap_interact(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 4
+
+
+@pytest.mark.parametrize(
+ "text,length",
+ [("kerana", 1), ("Mahathir-Anwar", 3), ("Tun Dr. Ismail-Abdul Rahman", 6)],
+)
+def test_ms_tokenizer_splits_hyphens(ms_tokenizer, text, length):
+ tokens = ms_tokenizer(text)
+ assert len(tokens) == length
+
+
+@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
+def test_ms_tokenizer_splits_numeric_range(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize("text", ["ini.Sani", "Halo.Malaysia"])
+def test_ms_tokenizer_splits_period_infix(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize("text", ["Halo,Malaysia", "satu,dua"])
+def test_ms_tokenizer_splits_comma_infix(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 3
+ assert tokens[0].text == text.split(",")[0]
+ assert tokens[1].text == ","
+ assert tokens[2].text == text.split(",")[1]
+
+
+@pytest.mark.parametrize("text", ["halo...Malaysia", "dia...pergi"])
+def test_ms_tokenizer_splits_ellipsis_infix(ms_tokenizer, text):
+    tokens = ms_tokenizer(text)
+ assert len(tokens) == 3
+
+
+def test_ms_tokenizer_splits_double_hyphen_infix(ms_tokenizer):
+    tokens = ms_tokenizer("Arsene Wenger--pengurus Arsenal--mengadakan sidang media.")
+ assert len(tokens) == 10
+ assert tokens[0].text == "Arsene"
+ assert tokens[1].text == "Wenger"
+ assert tokens[2].text == "--"
+ assert tokens[3].text == "pengurus"
+ assert tokens[4].text == "Arsenal"
+ assert tokens[5].text == "--"
+ assert tokens[6].text == "mengadakan"
+ assert tokens[7].text == "sidang"
+ assert tokens[8].text == "media"
+ assert tokens[9].text == "."
diff --git a/spacy/tests/lang/ms/test_text.py b/spacy/tests/lang/ms/test_text.py
new file mode 100644
index 000000000..d6cd169ce
--- /dev/null
+++ b/spacy/tests/lang/ms/test_text.py
@@ -0,0 +1,8 @@
+import pytest
+from spacy.lang.ms.lex_attrs import like_num
+
+
+@pytest.mark.parametrize("word", ["sebelas"])
+def test_ms_lex_attrs_capitals(word):
+ assert like_num(word)
+ assert like_num(word.upper())
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 030182a63..7198859b3 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -728,9 +728,9 @@ def test_neg_annotation(neg_key):
ner.add_label("ORG")
example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
example.reference.spans[neg_key] = [
- Span(neg_doc, 2, 4, "ORG"),
- Span(neg_doc, 2, 3, "PERSON"),
- Span(neg_doc, 1, 4, "PERSON"),
+ Span(example.reference, 2, 4, "ORG"),
+ Span(example.reference, 2, 3, "PERSON"),
+ Span(example.reference, 1, 4, "PERSON"),
]
optimizer = nlp.initialize()
@@ -755,7 +755,7 @@ def test_neg_annotation_conflict(neg_key):
ner.add_label("PERSON")
ner.add_label("LOC")
example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
- example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "PERSON")]
+ example.reference.spans[neg_key] = [Span(example.reference, 2, 4, "PERSON")]
assert len(example.reference.ents) == 1
assert example.reference.ents[0].text == "Shaka Khan"
assert example.reference.ents[0].label_ == "PERSON"
@@ -788,7 +788,7 @@ def test_beam_valid_parse(neg_key):
doc = Doc(nlp.vocab, words=tokens)
example = Example.from_dict(doc, {"ner": iob})
- neg_span = Span(doc, 50, 53, "ORG")
+ neg_span = Span(example.reference, 50, 53, "ORG")
example.reference.spans[neg_key] = [neg_span]
optimizer = nlp.initialize()
diff --git a/spacy/tests/pipeline/test_span_finder.py b/spacy/tests/pipeline/test_span_finder.py
new file mode 100644
index 000000000..91b08cabf
--- /dev/null
+++ b/spacy/tests/pipeline/test_span_finder.py
@@ -0,0 +1,242 @@
+import pytest
+from thinc.api import Config
+
+from spacy.language import Language
+from spacy.lang.en import English
+from spacy.pipeline.span_finder import span_finder_default_config
+from spacy.tokens import Doc
+from spacy.training import Example
+from spacy import util
+from spacy.util import registry
+from spacy.util import fix_random_seed, make_tempdir
+
+
+SPANS_KEY = "pytest"
+TRAIN_DATA = [
+ ("Who is Shaka Khan?", {"spans": {SPANS_KEY: [(7, 17)]}}),
+ (
+ "I like London and Berlin.",
+ {"spans": {SPANS_KEY: [(7, 13), (18, 24)]}},
+ ),
+]
+
+TRAIN_DATA_OVERLAPPING = [
+ ("Who is Shaka Khan?", {"spans": {SPANS_KEY: [(7, 17)]}}),
+ (
+ "I like London and Berlin",
+ {"spans": {SPANS_KEY: [(7, 13), (18, 24), (7, 24)]}},
+ ),
+ ("", {"spans": {SPANS_KEY: []}}),
+]
+
+
+def make_examples(nlp, data=TRAIN_DATA):
+ train_examples = []
+ for t in data:
+ eg = Example.from_dict(nlp.make_doc(t[0]), t[1])
+ train_examples.append(eg)
+ return train_examples
+
+
+@pytest.mark.parametrize(
+ "tokens_predicted, tokens_reference, reference_truths",
+ [
+ (
+ ["Mon", ".", "-", "June", "16"],
+ ["Mon.", "-", "June", "16"],
+ [(0, 0), (0, 0), (0, 0), (1, 1), (0, 0)],
+ ),
+ (
+ ["Mon.", "-", "J", "une", "16"],
+ ["Mon.", "-", "June", "16"],
+ [(0, 0), (0, 0), (1, 0), (0, 1), (0, 0)],
+ ),
+ (
+ ["Mon", ".", "-", "June", "16"],
+ ["Mon.", "-", "June", "1", "6"],
+ [(0, 0), (0, 0), (0, 0), (1, 1), (0, 0)],
+ ),
+ (
+ ["Mon.", "-J", "un", "e 16"],
+ ["Mon.", "-", "June", "16"],
+ [(0, 0), (0, 0), (0, 0), (0, 0)],
+ ),
+ pytest.param(
+ ["Mon.-June", "16"],
+ ["Mon.", "-", "June", "16"],
+ [(0, 1), (0, 0)],
+ ),
+ pytest.param(
+ ["Mon.-", "June", "16"],
+ ["Mon.", "-", "J", "une", "16"],
+ [(0, 0), (1, 1), (0, 0)],
+ ),
+ pytest.param(
+ ["Mon.-", "June 16"],
+ ["Mon.", "-", "June", "16"],
+ [(0, 0), (1, 0)],
+ ),
+ ],
+)
+def test_loss_alignment_example(tokens_predicted, tokens_reference, reference_truths):
+ nlp = Language()
+ predicted = Doc(
+ nlp.vocab, words=tokens_predicted, spaces=[False] * len(tokens_predicted)
+ )
+ reference = Doc(
+ nlp.vocab, words=tokens_reference, spaces=[False] * len(tokens_reference)
+ )
+ example = Example(predicted, reference)
+ example.reference.spans[SPANS_KEY] = [example.reference.char_span(5, 9)]
+ span_finder = nlp.add_pipe("span_finder", config={"spans_key": SPANS_KEY})
+ nlp.initialize()
+ ops = span_finder.model.ops
+ if predicted.text != reference.text:
+ with pytest.raises(
+ ValueError, match="must match between reference and predicted"
+ ):
+ span_finder._get_aligned_truth_scores([example], ops)
+ return
+ truth_scores, masks = span_finder._get_aligned_truth_scores([example], ops)
+ assert len(truth_scores) == len(tokens_predicted)
+ ops.xp.testing.assert_array_equal(truth_scores, ops.xp.asarray(reference_truths))
+
+
+def test_span_finder_model():
+ nlp = Language()
+
+ docs = [nlp("This is an example."), nlp("This is the second example.")]
+ docs[0].spans[SPANS_KEY] = [docs[0][3:4]]
+ docs[1].spans[SPANS_KEY] = [docs[1][3:5]]
+
+ total_tokens = 0
+ for doc in docs:
+ total_tokens += len(doc)
+
+ config = Config().from_str(span_finder_default_config).interpolate()
+ model = registry.resolve(config)["model"]
+
+ model.initialize(X=docs)
+ predictions = model.predict(docs)
+
+ assert len(predictions) == total_tokens
+ assert len(predictions[0]) == 2
+
+
+def test_span_finder_component():
+ nlp = Language()
+
+ docs = [nlp("This is an example."), nlp("This is the second example.")]
+ docs[0].spans[SPANS_KEY] = [docs[0][3:4]]
+ docs[1].spans[SPANS_KEY] = [docs[1][3:5]]
+
+ span_finder = nlp.add_pipe("span_finder", config={"spans_key": SPANS_KEY})
+ nlp.initialize()
+ docs = list(span_finder.pipe(docs))
+
+ assert SPANS_KEY in docs[0].spans
+
+
+@pytest.mark.parametrize(
+ "min_length, max_length, span_count",
+ [(0, 0, 0), (None, None, 8), (2, None, 6), (None, 1, 2), (2, 3, 2)],
+)
+def test_set_annotations_span_lengths(min_length, max_length, span_count):
+ nlp = Language()
+ doc = nlp("Me and Jenny goes together like peas and carrots.")
+ if min_length == 0 and max_length == 0:
+ with pytest.raises(ValueError, match="Both 'min_length' and 'max_length'"):
+ span_finder = nlp.add_pipe(
+ "span_finder",
+ config={
+ "max_length": max_length,
+ "min_length": min_length,
+ "spans_key": SPANS_KEY,
+ },
+ )
+ return
+ span_finder = nlp.add_pipe(
+ "span_finder",
+ config={
+ "max_length": max_length,
+ "min_length": min_length,
+ "spans_key": SPANS_KEY,
+ },
+ )
+ nlp.initialize()
+ # Starts [Me, Jenny, peas]
+ # Ends [Jenny, peas, carrots]
+ scores = [
+ (1, 0),
+ (0, 0),
+ (1, 1),
+ (0, 0),
+ (0, 0),
+ (0, 0),
+ (1, 1),
+ (0, 0),
+ (0, 1),
+ (0, 0),
+ ]
+ span_finder.set_annotations([doc], scores)
+
+ assert doc.spans[SPANS_KEY]
+ assert len(doc.spans[SPANS_KEY]) == span_count
+
+ # Assert below will fail when max_length is set to 0
+ if max_length is None:
+ max_length = float("inf")
+ if min_length is None:
+ min_length = 1
+
+ assert all(min_length <= len(span) <= max_length for span in doc.spans[SPANS_KEY])
+
+
+def test_overfitting_IO():
+ # Simple test to try and quickly overfit the span_finder component - ensuring the ML models work correctly
+ fix_random_seed(0)
+ nlp = English()
+ span_finder = nlp.add_pipe("span_finder", config={"spans_key": SPANS_KEY})
+ train_examples = make_examples(nlp)
+ optimizer = nlp.initialize(get_examples=lambda: train_examples)
+ assert span_finder.model.get_dim("nO") == 2
+
+ for i in range(50):
+ losses = {}
+ nlp.update(train_examples, sgd=optimizer, losses=losses)
+ assert losses["span_finder"] < 0.001
+
+ # test the trained model
+ test_text = "I like London and Berlin"
+ doc = nlp(test_text)
+ spans = doc.spans[SPANS_KEY]
+ assert len(spans) == 3
+ assert set([span.text for span in spans]) == {
+ "London",
+ "Berlin",
+ "London and Berlin",
+ }
+
+ # Also test the results are still the same after IO
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ nlp2 = util.load_model_from_path(tmp_dir)
+ doc2 = nlp2(test_text)
+ spans2 = doc2.spans[SPANS_KEY]
+ assert len(spans2) == 3
+ assert set([span.text for span in spans2]) == {
+ "London",
+ "Berlin",
+ "London and Berlin",
+ }
+
+ # Test scoring
+ scores = nlp.evaluate(train_examples)
+ assert f"span_finder_{SPANS_KEY}_f" in scores
+ # It's not perfect 1.0 F1 because it's designed to overgenerate for now.
+ assert scores[f"span_finder_{SPANS_KEY}_p"] == 0.75
+ assert scores[f"span_finder_{SPANS_KEY}_r"] == 1.0
+
+ # also test that the spancat works for just a single entity in a sentence
+ doc = nlp("London")
+ assert len(doc.spans[SPANS_KEY]) == 1
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 199ef2b2a..b7024cf36 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -406,6 +406,21 @@ def test_ngram_sizes(en_tokenizer):
assert_array_equal(OPS.to_numpy(ngrams_3.lengths), [0, 1, 3, 6, 9])
+def test_preset_spans_suggester():
+ nlp = Language()
+ docs = [nlp("This is an example."), nlp("This is the second example.")]
+ docs[0].spans[SPAN_KEY] = [docs[0][3:4]]
+ docs[1].spans[SPAN_KEY] = [docs[1][0:4], docs[1][3:5]]
+ suggester = registry.misc.get("spacy.preset_spans_suggester.v1")(spans_key=SPAN_KEY)
+ candidates = suggester(docs)
+ assert type(candidates) == Ragged
+ assert len(candidates) == 2
+ assert list(candidates.dataXd[0]) == [3, 4]
+ assert list(candidates.dataXd[1]) == [0, 4]
+ assert list(candidates.dataXd[2]) == [3, 5]
+ assert list(candidates.lengths) == [1, 2]
+
+
def test_overfitting_IO():
# Simple test to try and quickly overfit the spancat component - ensuring the ML models work correctly
fix_random_seed(0)
@@ -428,7 +443,7 @@ def test_overfitting_IO():
spans = doc.spans[SPAN_KEY]
assert len(spans) == 2
assert len(spans.attrs["scores"]) == 2
- assert min(spans.attrs["scores"]) > 0.9
+ assert min(spans.attrs["scores"]) > 0.8
assert set([span.text for span in spans]) == {"London", "Berlin"}
assert set([span.label_ for span in spans]) == {"LOC"}
@@ -440,7 +455,7 @@ def test_overfitting_IO():
spans2 = doc2.spans[SPAN_KEY]
assert len(spans2) == 2
assert len(spans2.attrs["scores"]) == 2
- assert min(spans2.attrs["scores"]) > 0.9
+ assert min(spans2.attrs["scores"]) > 0.8
assert set([span.text for span in spans2]) == {"London", "Berlin"}
assert set([span.label_ for span in spans2]) == {"LOC"}
diff --git a/spacy/tests/serialize/test_resource_warning.py b/spacy/tests/serialize/test_resource_warning.py
index 38701c6d9..befd05635 100644
--- a/spacy/tests/serialize/test_resource_warning.py
+++ b/spacy/tests/serialize/test_resource_warning.py
@@ -72,7 +72,7 @@ def entity_linker():
def create_kb(vocab):
kb = InMemoryLookupKB(vocab, entity_vector_length=1)
- kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
+ kb.add_entity("test", 0.0, zeros((1,), dtype="f"))
return kb
entity_linker = nlp.add_pipe("entity_linker")
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 9ba4f0e5c..5ff4dfa26 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -103,6 +103,8 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):
# project tests
+CFG_FILE = "myconfig.cfg"
+
SAMPLE_PROJECT = {
"title": "Sample project",
"description": "This is a project for testing",
@@ -128,13 +130,8 @@ SAMPLE_PROJECT = {
{
"name": "create",
"help": "make a file",
- "script": ["touch abc.txt"],
- "outputs": ["abc.txt"],
- },
- {
- "name": "clean",
- "help": "remove test file",
- "script": ["rm abc.txt"],
+ "script": [f"python -m spacy init config {CFG_FILE}"],
+ "outputs": [f"{CFG_FILE}"],
},
],
}
@@ -175,7 +172,7 @@ def test_project_assets(project_dir):
def test_project_run(project_dir):
# make sure dry run works
- test_file = project_dir / "abc.txt"
+ test_file = project_dir / CFG_FILE
result = CliRunner().invoke(
app, ["project", "run", "--dry", "create", str(project_dir)]
)
@@ -223,14 +220,13 @@ def test_project_push_pull(project_dir):
proj_text = srsly.yaml_dumps(proj)
(project_dir / "project.yml").write_text(proj_text)
- test_file = project_dir / "abc.txt"
+ test_file = project_dir / CFG_FILE
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
assert result.exit_code == 0
assert test_file.is_file()
result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
assert result.exit_code == 0
- result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)])
- assert result.exit_code == 0
+ test_file.unlink()
assert not test_file.exists()
result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
assert result.exit_code == 0
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index dbb47b423..f95c44149 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -115,6 +115,14 @@ def test_tokenization(sented_doc):
assert scores["token_r"] == approx(0.33333333)
assert scores["token_f"] == 0.4
+ # per-component scoring
+ scorer = Scorer()
+ scores = scorer.score([example], per_component=True)
+ assert scores["tokenizer"]["token_acc"] == 0.5
+ assert scores["tokenizer"]["token_p"] == 0.5
+ assert scores["tokenizer"]["token_r"] == approx(0.33333333)
+ assert scores["tokenizer"]["token_f"] == 0.4
+
def test_sents(sented_doc):
scorer = Scorer()
@@ -278,6 +286,13 @@ def test_tag_score(tagged_doc):
assert results["morph_per_feat"]["Poss"]["f"] == 0.0
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
+ # per-component scoring
+ scorer = Scorer()
+ results = scorer.score([example], per_component=True)
+ assert results["tagger"]["tag_acc"] == 0.9
+ assert results["morphologizer"]["pos_acc"] == 0.9
+ assert results["morphologizer"]["morph_acc"] == approx(0.8)
+
def test_partial_annotation(en_tokenizer):
pred_doc = en_tokenizer("a b c d e")
@@ -423,14 +438,14 @@ def test_score_spans():
return doc.spans[span_key]
# Predict exactly the same, but overlapping spans will be discarded
- pred.spans[key] = spans
+ pred.spans[key] = gold.spans[key].copy(doc=pred)
eg = Example(pred, gold)
scores = Scorer.score_spans([eg], attr=key, getter=span_getter)
assert scores[f"{key}_p"] == 1.0
assert scores[f"{key}_r"] < 1.0
# Allow overlapping, now both precision and recall should be 100%
- pred.spans[key] = spans
+ pred.spans[key] = gold.spans[key].copy(doc=pred)
eg = Example(pred, gold)
scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True)
assert scores[f"{key}_p"] == 1.0
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index a54b4ad3c..6c196ad78 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1264,12 +1264,14 @@ cdef class Doc:
other.user_span_hooks = dict(self.user_span_hooks)
other.length = self.length
other.max_length = self.max_length
- other.spans = self.spans.copy(doc=other)
buff_size = other.max_length + (PADDING*2)
assert buff_size > 0
tokens = other.mem.alloc(buff_size, sizeof(TokenC))
memcpy(tokens, self.c - PADDING, buff_size * sizeof(TokenC))
other.c = &tokens[PADDING]
+ # copy spans after setting tokens so that SpanGroup.copy can verify
+ # that the start/end offsets are valid
+ other.spans = self.spans.copy(doc=other)
return other
def to_disk(self, path, *, exclude=tuple()):
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index a92f19e20..b982eb810 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -1,10 +1,12 @@
-from typing import Callable, Protocol, Iterator, Optional, Union, Tuple, Any, overload
-from thinc.types import Floats1d, Ints2d, FloatsXd
+from typing import Any, Callable, Iterator, Optional, Protocol, Tuple, Union, overload
+
+from thinc.types import Floats1d, FloatsXd, Ints2d
+
+from ..lexeme import Lexeme
+from ..vocab import Vocab
from .doc import Doc
from .token import Token
from .underscore import Underscore
-from ..lexeme import Lexeme
-from ..vocab import Vocab
class SpanMethod(Protocol):
def __call__(self: Span, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
@@ -51,7 +53,12 @@ class Span:
kb_id: Union[str, int] = ...,
span_id: Union[str, int] = ...,
) -> None: ...
- def __richcmp__(self, other: Span, op: int) -> bool: ...
+ def __lt__(self, other: Any) -> bool: ...
+ def __le__(self, other: Any) -> bool: ...
+ def __eq__(self, other: Any) -> bool: ...
+ def __ne__(self, other: Any) -> bool: ...
+ def __gt__(self, other: Any) -> bool: ...
+ def __ge__(self, other: Any) -> bool: ...
def __hash__(self) -> int: ...
def __len__(self) -> int: ...
def __repr__(self) -> str: ...
diff --git a/spacy/tokens/span_group.pyi b/spacy/tokens/span_group.pyi
index 0b4aa83aa..d063bb595 100644
--- a/spacy/tokens/span_group.pyi
+++ b/spacy/tokens/span_group.pyi
@@ -1,4 +1,5 @@
-from typing import Any, Dict, Iterable, Optional
+from typing import Any, Dict, Iterable, Iterator, Optional
+
from .doc import Doc
from .span import Span
@@ -18,7 +19,7 @@ class SpanGroup:
def doc(self) -> Doc: ...
@property
def has_overlap(self) -> bool: ...
- def __iter__(self): ...
+ def __iter__(self) -> Iterator[Span]: ...
def __len__(self) -> int: ...
def append(self, span: Span) -> None: ...
def extend(self, spans: Iterable[Span]) -> None: ...
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index 608dda283..c748fa256 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -52,6 +52,8 @@ cdef class SpanGroup:
if len(spans) :
self.c.reserve(len(spans))
for span in spans:
+ if doc is not span.doc:
+ raise ValueError(Errors.E855.format(obj="span"))
self.push_back(span.c)
def __repr__(self):
@@ -261,11 +263,22 @@ cdef class SpanGroup:
"""
if doc is None:
doc = self.doc
+ if doc is self.doc:
+ spans = list(self)
+ else:
+ spans = [doc.char_span(span.start_char, span.end_char, label=span.label_, kb_id=span.kb_id, span_id=span.id) for span in self]
+ for i, span in enumerate(spans):
+ if span is None:
+ raise ValueError(Errors.E1052.format(i=i))
+ if span.kb_id in self.doc.vocab.strings:
+ doc.vocab.strings.add(span.kb_id_)
+ if span.id in span.doc.vocab.strings:
+ doc.vocab.strings.add(span.id_)
return SpanGroup(
doc,
name=self.name,
attrs=deepcopy(self.attrs),
- spans=list(self),
+ spans=spans,
)
def _concat(
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index e90617852..9cf759c55 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -133,10 +133,11 @@ def init_vocab(
logger.info("Added vectors: %s", vectors)
# warn if source model vectors are not identical
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
- vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
- for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
- if vectors_hash != sourced_vectors_hash:
- warnings.warn(Warnings.W113.format(name=sourced_component))
+ if len(sourced_vectors_hashes) > 0:
+ vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
+ for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
+ if vectors_hash != sourced_vectors_hash:
+ warnings.warn(Warnings.W113.format(name=sourced_component))
logger.info("Finished initializing nlp object")
diff --git a/spacy/ty.py b/spacy/ty.py
index 8f2903d78..7e79a3d4d 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -1,11 +1,13 @@
from typing import TYPE_CHECKING
from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
+
from .compat import Protocol, runtime_checkable
from thinc.api import Optimizer, Model
if TYPE_CHECKING:
from .training import Example
+ from .language import Language
@runtime_checkable
@@ -32,7 +34,7 @@ class InitializableComponent(Protocol):
def initialize(
self,
get_examples: Callable[[], Iterable["Example"]],
- nlp: Iterable["Example"],
+ nlp: "Language",
**kwargs: Any
):
...
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 323ea2a92..5b4bca1ce 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1163,18 +1163,19 @@ skew. To render a sample of dependency parses in a HTML file using the
$ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
```
-| Name | Description |
-| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
-| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
-| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
-| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
-| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
-| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
-| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
-| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
-| **CREATES** | Training results and optional metrics and visualizations. |
+| Name | Description |
+| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
+| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
+| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
+| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
+| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
+| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
+| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
+| `--per-component`, `-P` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| **CREATES** | Training results and optional metrics and visualizations. |
### speed {id="benchmark-speed", version="3.5", tag="command"}
@@ -1220,7 +1221,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
-| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
+| `output-file` | Output `DocBin` path. ~~str (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
@@ -1640,7 +1641,7 @@ with [`spacy package`](/api/cli#package) and `--build wheel`. For more details,
see the spaCy project [integration](/usage/projects#huggingface_hub).
```bash
-$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] [--verbose]
+$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose]
```
> #### Example
@@ -1654,6 +1655,5 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo]
| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ |
| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ |
| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ |
-| `--local-repo`, `-l` | Local path to the model repository (will be created if it doesn't exist). Defaults to `hub` in the current working directory. ~~Path (option)~~ |
| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ |
| **UPLOADS** | The pipeline to the hub. |
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index d84dd3ca9..21d2e9015 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -64,7 +64,7 @@ architectures and their arguments and hyperparameters.
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
-| `generate_empty_kb` 3.6 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
+| `generate_empty_kb` 3.5.1 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 93ddd79a2..de23156b9 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -382,15 +382,16 @@ objects instead of tuples of `Doc` and `GoldParse` objects.
> print(scores)
> ```
-| Name | Description |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
-| _keyword-only_ | |
-| `batch_size` | The batch size to use. ~~Optional[int]~~ |
-| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |
-| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name | Description |
+| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `batch_size` | The batch size to use. ~~Optional[int]~~ |
+| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |
+| `per_component` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ |
+| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
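+
+With `per_component=True`, the returned scores are keyed by component name
+rather than merged into a single dictionary. A minimal sketch, assuming
+`examples` is a list of `Example` objects:
+
+> #### Example
+>
+> ```python
+> scores = nlp.evaluate(examples, per_component=True)
+> print(scores["tagger"])
+> ```
+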
## Language.use_params {id="use_params",tag="contextmanager, method"}
diff --git a/website/docs/api/morphology.mdx b/website/docs/api/morphology.mdx
index 5d4affafe..018ce2524 100644
--- a/website/docs/api/morphology.mdx
+++ b/website/docs/api/morphology.mdx
@@ -213,11 +213,11 @@ Retrieve values for a feature by field.
> assert morph.get("Feat1") == ["Val1", "Val2"]
> ```
-| Name | Description |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
-| `field` | The field to retrieve. ~~str~~ |
-| `default` 3.6 | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ |
-| **RETURNS** | A list of the individual features. ~~List[str]~~ |
+| Name | Description |
+| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| `field` | The field to retrieve. ~~str~~ |
+| `default` 3.5.3 | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ |
+| **RETURNS** | A list of the individual features. ~~List[str]~~ |
### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"}
diff --git a/website/docs/api/scorer.mdx b/website/docs/api/scorer.mdx
index 6f0c95f6f..9bdd0a8f4 100644
--- a/website/docs/api/scorer.mdx
+++ b/website/docs/api/scorer.mdx
@@ -33,7 +33,7 @@ Create a new `Scorer`.
| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ |
| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ |
| _keyword-only_ | |
-| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |
+| `**kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |
## Scorer.score {id="score",tag="method"}
@@ -67,10 +67,12 @@ core pipeline components, the individual score names start with the `Token` or
> scores = scorer.score(examples)
> ```
-| Name | Description |
-| ----------- | ------------------------------------------------------------------------------------------------------------------- |
-| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
-| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name | Description |
+| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
+| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `per_component` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ |
+| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
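+
+With `per_component=True`, the scores are nested under the name of the
+component that produced them. A minimal sketch, assuming `examples` as in the
+example above:
+
+> #### Example
+>
+> ```python
+> scores = scorer.score(examples, per_component=True)
+> print(scores["tagger"]["tag_acc"])
+> ```
+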
## Scorer.score_tokenization {id="score_tokenization",tag="staticmethod",version="3"}
diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx
index f54a8687b..81a473ac2 100644
--- a/website/docs/api/spancategorizer.mdx
+++ b/website/docs/api/spancategorizer.mdx
@@ -105,7 +105,7 @@ architectures and their arguments and hyperparameters.
>
> # Construction via add_pipe with custom model
> config = {"model": {"@architectures": "my_spancat"}}
-> parser = nlp.add_pipe("spancat", config=config)
+> spancat = nlp.add_pipe("spancat", config=config)
>
> # Construction from class
> from spacy.pipeline import SpanCategorizer
@@ -524,3 +524,22 @@ has two columns, indicating the start and end position.
| `min_size` | The minimal phrase lengths to suggest (inclusive). ~~[int]~~ |
| `max_size` | The maximal phrase lengths to suggest (exclusive). ~~[int]~~ |
| **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
+
+### spacy.preset_spans_suggester.v1 {id="preset_spans_suggester"}
+
+> #### Example Config
+>
+> ```ini
+> [components.spancat.suggester]
+> @misc = "spacy.preset_spans_suggester.v1"
+> spans_key = "my_spans"
+> ```
+
+Suggest all spans that are already stored in `doc.spans[spans_key]`. This is
+useful when the spans are set on the `Doc` by an upstream component such as a
+[`SpanRuler`](/api/spanruler) or [`SpanFinder`](/api/spanfinder).
+
+| Name | Description |
+| ----------- | ----------------------------------------------------------------------------- |
+| `spans_key` | Key of [`Doc.spans`](/api/doc/#spans) that provides spans to suggest. ~~str~~ |
+| **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
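+
+The registered suggester function can also be created and called directly. A
+minimal sketch, with the spans set by hand purely for illustration (normally
+an upstream component provides them):
+
+> #### Example
+>
+> ```python
+> import spacy
+> from spacy.util import registry
+>
+> nlp = spacy.blank("en")
+> doc = nlp("This is an example.")
+> doc.spans["my_spans"] = [doc[3:4]]
+> suggester = registry.misc.get("spacy.preset_spans_suggester.v1")(spans_key="my_spans")
+> candidates = suggester([doc])  # a Ragged array of (start, end) token offsets
+> ```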
diff --git a/website/docs/api/spanfinder.mdx b/website/docs/api/spanfinder.mdx
new file mode 100644
index 000000000..ca3104c85
--- /dev/null
+++ b/website/docs/api/spanfinder.mdx
@@ -0,0 +1,372 @@
+---
+title: SpanFinder
+tag: class,experimental
+source: spacy/pipeline/span_finder.py
+version: 3.6
+teaser:
+ 'Pipeline component for identifying potentially overlapping spans of text'
+api_base_class: /api/pipe
+api_string_name: span_finder
+api_trainable: true
+---
+
+The span finder identifies potentially overlapping, unlabeled spans. It
+identifies tokens that start or end spans and annotates unlabeled spans between
+starts and ends, with optional filters for min and max span length. It is
+intended for use in combination with a component like
+[`SpanCategorizer`](/api/spancategorizer) that may further filter or label the
+spans. Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the
+doc under `doc.spans[spans_key]`, where `spans_key` is a component config
+setting.
+
+## Assigned Attributes {id="assigned-attributes"}
+
+Predictions will be saved to `Doc.spans[spans_key]` as a
+[`SpanGroup`](/api/spangroup).
+
+`spans_key` defaults to `"sc"`, but can be passed as a parameter. The
+`span_finder` component will overwrite any existing spans stored under
+`doc.spans[spans_key]`.
+
+| Location | Value |
+| ---------------------- | ---------------------------------- |
+| `Doc.spans[spans_key]` | The unlabeled spans. ~~SpanGroup~~ |
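+
+A minimal sketch of reading the predictions back, assuming a trained pipeline
+that contains a `span_finder` component with the default `spans_key`:
+
+> #### Example
+>
+> ```python
+> doc = nlp("I like London and Berlin.")
+> print(doc.spans["sc"])
+> ```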
+
+## Config and implementation {id="config"}
+
+The default config is defined by the pipeline component factory and describes
+how the component should be configured. You can override its settings via the
+`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
+[`config.cfg` for training](/usage/training#config). See the
+[model architectures](/api/architectures) documentation for details on the
+architectures and their arguments and hyperparameters.
+
+> #### Example
+>
+> ```python
+> from spacy.pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL
+> config = {
+> "threshold": 0.5,
+> "spans_key": "my_spans",
+> "max_length": None,
+> "min_length": None,
+> "model": DEFAULT_SPAN_FINDER_MODEL,
+> }
+> nlp.add_pipe("span_finder", config=config)
+> ```
+
+| Setting | Description |
+| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model` | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ |
+| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
+| `threshold` | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ |
+| `max_length` | Maximum length of the produced spans. Defaults to `None`, meaning unlimited length. ~~Optional[int]~~ |
+| `min_length` | Minimum length of the produced spans. Defaults to `None`, meaning the shortest span length is 1. ~~Optional[int]~~ |
+| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
+
+```python
+%%GITHUB_SPACY/spacy/pipeline/span_finder.py
+```
+
+## SpanFinder.\_\_init\_\_ {id="init",tag="method"}
+
+> #### Example
+>
+> ```python
+> # Construction via add_pipe with default model
+> span_finder = nlp.add_pipe("span_finder")
+>
+> # Construction via add_pipe with custom model
+> config = {"model": {"@architectures": "my_span_finder"}}
+> span_finder = nlp.add_pipe("span_finder", config=config)
+>
+> # Construction from class
+> from spacy.pipeline import SpanFinder
+> span_finder = SpanFinder(nlp.vocab, model)
+> ```
+
+Create a new pipeline instance. In your application, you would normally use a
+shortcut for this and instantiate the component using its string name and
+[`nlp.add_pipe`](/api/language#create_pipe).
+
+| Name | Description |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ |
+| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
+| `threshold` | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ |
+| `max_length` | Maximum length of the produced spans. Defaults to `None`, meaning unlimited length. ~~Optional[int]~~ |
+| `min_length` | Minimum length of the produced spans. Defaults to `None`, meaning the shortest span length is 1. ~~Optional[int]~~ |
+| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
+
+## SpanFinder.\_\_call\_\_ {id="call",tag="method"}
+
+Apply the pipe to one document. The document is modified in place, and returned.
+This usually happens under the hood when the `nlp` object is called on a text
+and all pipeline components are applied to the `Doc` in order. Both
+[`__call__`](/api/spanfinder#call) and [`pipe`](/api/spanfinder#pipe) delegate
+to the [`predict`](/api/spanfinder#predict) and
+[`set_annotations`](/api/spanfinder#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> doc = nlp("This is a sentence.")
+> span_finder = nlp.add_pipe("span_finder")
+> # This usually happens under the hood
+> processed = span_finder(doc)
+> ```
+
+| Name | Description |
+| ----------- | -------------------------------- |
+| `doc` | The document to process. ~~Doc~~ |
+| **RETURNS** | The processed document. ~~Doc~~ |
+
+## SpanFinder.pipe {id="pipe",tag="method"}
+
+Apply the pipe to a stream of documents. This usually happens under the hood
+when the `nlp` object is called on a text and all pipeline components are
+applied to the `Doc` in order. Both [`__call__`](/api/spanfinder#call) and
+[`pipe`](/api/spanfinder#pipe) delegate to the
+[`predict`](/api/spanfinder#predict) and
+[`set_annotations`](/api/spanfinder#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> for doc in span_finder.pipe(docs, batch_size=50):
+> pass
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------- |
+| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
+| _keyword-only_ | |
+| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
+| **YIELDS** | The processed documents in order. ~~Doc~~ |
+
+## SpanFinder.initialize {id="initialize",tag="method"}
+
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. **At least one example
+should be supplied.** The data examples are used to **initialize the model** of
+the component and can either be the full training data or a representative
+sample. Initialization includes validating the network and
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation). This
+method is typically called by [`Language.initialize`](/api/language#initialize)
+and lets you customize arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> span_finder.initialize(lambda: examples, nlp=nlp)
+> ```
+
+| Name | Description |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ | |
+| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+
+## SpanFinder.predict {id="predict",tag="method"}
+
+Apply the component's model to a batch of [`Doc`](/api/doc) objects without
+modifying them.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> scores = span_finder.predict([doc1, doc2])
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------------- |
+| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
+| **RETURNS** | The model's prediction for each document. |
+
+## SpanFinder.set_annotations {id="set_annotations",tag="method"}
+
+Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> scores = span_finder.predict(docs)
+> span_finder.set_annotations(docs, scores)
+> ```
+
+| Name | Description |
+| -------- | ---------------------------------------------------- |
+| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
+| `scores` | The scores to set, produced by `SpanFinder.predict`. |
+
+## SpanFinder.update {id="update",tag="method"}
+
+Learn from a batch of [`Example`](/api/example) objects containing the
+predictions and gold-standard annotations, and update the component's model.
+Delegates to [`predict`](/api/spanfinder#predict) and
+[`get_loss`](/api/spanfinder#get_loss).
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> optimizer = nlp.initialize()
+> losses = span_finder.update(examples, sgd=optimizer)
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
+| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `drop` | The dropout rate. ~~float~~ |
+| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
+## SpanFinder.get_loss {id="get_loss",tag="method"}
+
+Find the loss and gradient of loss for the batch of documents and their
+predicted scores.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> scores = span_finder.predict([eg.predicted for eg in examples])
+> loss, d_loss = span_finder.get_loss(examples, scores)
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------ |
+| `examples` | The batch of examples. ~~Iterable[Example]~~ |
+| `spans_scores` | Scores representing the model's predictions. ~~Tuple[Ragged, Floats2d]~~ |
+| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, Floats2d]~~ |
+
+## SpanFinder.create_optimizer {id="create_optimizer",tag="method"}
+
+Create an optimizer for the pipeline component.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> optimizer = span_finder.create_optimizer()
+> ```
+
+| Name | Description |
+| ----------- | ---------------------------- |
+| **RETURNS** | The optimizer. ~~Optimizer~~ |
+
+## SpanFinder.use_params {id="use_params",tag="method, contextmanager"}
+
+Modify the pipe's model to use the given parameter values.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> with span_finder.use_params(optimizer.averages):
+> span_finder.to_disk("/best_model")
+> ```
+
+| Name | Description |
+| -------- | -------------------------------------------------- |
+| `params` | The parameter values to use in the model. ~~dict~~ |
+
+## SpanFinder.to_disk {id="to_disk",tag="method"}
+
+Serialize the pipe to disk.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> span_finder.to_disk("/path/to/span_finder")
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+
+## SpanFinder.from_disk {id="from_disk",tag="method"}
+
+Load the pipe from disk. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> span_finder.from_disk("/path/to/span_finder")
+> ```
+
+| Name | Description |
+| -------------- | ----------------------------------------------------------------------------------------------- |
+| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The modified `SpanFinder` object. ~~SpanFinder~~ |
+
+## SpanFinder.to_bytes {id="to_bytes",tag="method"}
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> span_finder_bytes = span_finder.to_bytes()
+> ```
+
+Serialize the pipe to a bytestring.
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The serialized form of the `SpanFinder` object. ~~bytes~~ |
+
+## SpanFinder.from_bytes {id="from_bytes",tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> span_finder_bytes = span_finder.to_bytes()
+> span_finder = nlp.add_pipe("span_finder")
+> span_finder.from_bytes(span_finder_bytes)
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| `bytes_data` | The data to load from. ~~bytes~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The `SpanFinder` object. ~~SpanFinder~~ |
+
+## Serialization fields {id="serialization-fields"}
+
+During serialization, spaCy will export several data fields used to restore
+different aspects of the object. If needed, you can exclude them from
+serialization by passing in the string names via the `exclude` argument.
+
+> #### Example
+>
+> ```python
+> data = span_finder.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name | Description |
+| ------- | -------------------------------------------------------------- |
+| `vocab` | The shared [`Vocab`](/api/vocab). |
+| `cfg` | The config file. You usually don't want to exclude this. |
+| `model` | The binary model data. You usually don't want to exclude this. |
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 6de1acdf0..64ec342cd 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -469,7 +469,7 @@ factories.
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
| `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). |
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
-| `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `\*\*kwargs` and return scores as `Dict[str, Any]`. |
+| `scorers`    | Registry for functions that create scoring methods for use with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `**kwargs` and return scores as `Dict[str, Any]`.   |
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
### spacy-transformers registry {id="registry-transformers"}
diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx
index a5b7990d6..4b06178d5 100644
--- a/website/docs/usage/index.mdx
+++ b/website/docs/usage/index.mdx
@@ -259,6 +259,26 @@ source code and recompiling frequently.
$ python setup.py develop
```
+#### Visual Studio Code extension
+
+![spaCy extension demo](/images/spacy-extension-demo.gif)
+
+The [spaCy VSCode Extension](https://github.com/explosion/spacy-vscode) provides
+additional tooling and features for working with spaCy's config files. Version
+1.0.0 of the installable extension includes hover descriptions for registry
+functions, variables, and section names within the config.
+
+1. Install a supported version of Python on your system (`>=3.7`)
+2. Install the
+ [Python Extension for Visual Studio Code](https://code.visualstudio.com/docs/python/python-tutorial)
+3. Create a
+   [virtual Python environment](https://docs.python.org/3/library/venv.html)
+4. Install all Python requirements (`spaCy >= 3.4.0` & `pygls >= 1.0.0`)
+5. Install the
+   [spaCy extension for Visual Studio Code](https://marketplace.visualstudio.com/items?itemName=Explosion.spacy-extension)
+6. Select your Python environment
+7. You are ready to work with `.cfg` files in spaCy!
+
### Building an executable {id="executable"}
The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that
diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx
index c372744de..1ac931753 100644
--- a/website/docs/usage/visualizers.mdx
+++ b/website/docs/usage/visualizers.mdx
@@ -56,14 +56,19 @@ wrap. So if you come across this problem, especially when using custom labels,
you'll have to increase the `distance` setting in the `options` to allow longer
arcs.
+You might also need to adjust the `offset_x` argument depending on the shape
+of your document. Otherwise, the left side of the rendered parse may overflow
+beyond the container's border.
+
-| Argument | Description |
-| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
-| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
-| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
-| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
+| Argument | Description |
+| ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
+| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
+| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
+| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
+| `offset_x` | Spacing on left side of the SVG in px. You might need to tweak this setting for long texts. Defaults to `50`. ~~int~~ |
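+
+A minimal sketch of passing these options to the dependency visualizer (the
+values here are illustrative):
+
+```python
+import spacy
+from spacy import displacy
+
+nlp = spacy.load("en_core_web_sm")
+doc = nlp("This is a sentence with a fairly long dependency arc.")
+options = {"compact": True, "distance": 120, "offset_x": 100}
+html = displacy.render(doc, style="dep", options=options)
+```
+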
For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options).
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 46c0d3adb..f88d2b7bf 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -264,6 +264,11 @@
"code": "mr",
"name": "Marathi"
},
+ {
+ "code": "ms",
+ "name": "Malay",
+ "has_examples": true
+ },
{
"code": "nb",
"name": "Norwegian Bokmål",
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index b5c555da6..12c3fce35 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -106,6 +106,7 @@
{ "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" },
{ "text": "Sentencizer", "url": "/api/sentencizer" },
{ "text": "SpanCategorizer", "url": "/api/spancategorizer" },
+ { "text": "SpanFinder", "url": "/api/spanfinder" },
{ "text": "SpanResolver", "url": "/api/span-resolver" },
{ "text": "SpanRuler", "url": "/api/spanruler" },
{ "text": "Tagger", "url": "/api/tagger" },
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 4067c4d1e..c2047c97d 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,72 @@
{
"resources": [
+ {
+ "id": "spacy-vscode",
+ "title": "spaCy Visual Studio Code Extension",
+ "thumb": "https://raw.githubusercontent.com/explosion/spacy-vscode/main/icon.png",
+ "slogan": "Work with spaCy's config files in VS Code",
+ "description": "The spaCy VS Code Extension provides additional tooling and features for working with spaCy's config files. Version 1.0.0 includes hover descriptions for registry functions, variables, and section names within the config as an installable extension.",
+ "url": "https://marketplace.visualstudio.com/items?itemName=Explosion.spacy-extension",
+ "github": "explosion/spacy-vscode",
+ "code_language": "python",
+ "author": "Explosion",
+ "author_links": {
+ "twitter": "@explosion_ai",
+ "github": "explosion"
+ },
+ "category": ["extension"],
+ "tags": []
+ },
+ {
+ "id": "parsigs",
+ "title": "parsigs",
+ "slogan": "Structuring prescriptions text made simple using spaCy",
+ "description": "Parsigs is an open-source project that aims to extract the relevant dosage information from prescriptions text without compromising the patient's privacy.\n\nNotice you also need to install the model in order to use the package: `pip install https://huggingface.co/royashcenazi/en_parsigs/resolve/main/en_parsigs-any-py3-none-any.whl`",
+ "github": "royashcenazi/parsigs",
+ "pip": "parsigs",
+ "code_language": "python",
+ "author": "Roy Ashcenazi",
+ "code_example": [
+ "# You'll need to install the trained model, see instructions in the description section",
+ "from parsigs.parse_sig_api import StructuredSig, SigParser",
+ "sig_parser = SigParser()",
+ "",
+ "sig = 'Take 1 tablet of ibuprofen 200mg 3 times every day for 3 weeks'",
+ "parsed_sig = sig_parser.parse(sig)"
+ ],
+ "author_links": {
+ "github": "royashcenazi"
+ },
+ "category": ["model", "research", "biomedical"],
+ "tags": ["sigs", "prescription","pharma"]
+ },
+ {
+ "id": "latincy",
+ "title": "LatinCy",
+ "thumb": "https://raw.githubusercontent.com/diyclassics/la_core_web_lg/main/latincy-logo.png",
+ "slogan": "Synthetic trained spaCy pipelines for Latin NLP",
+ "description": "Set of trained general purpose Latin-language 'core' pipelines for use with spaCy. The models are trained on a large amount of available Latin data, including all five of the Latin Universal Dependency treebanks, which have been preprocessed to be compatible with each other.",
+ "url": "https://huggingface.co/latincy",
+ "code_example": [
+ "# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl",
+ "import spacy",
+ "nlp = spacy.load('la_core_web_lg')",
+ "doc = nlp('Haec narrantur a poetis de Perseo')",
+ "",
+ "print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')",
+ "",
+ "# > Haec, haec, hic, DET"
+ ],
+ "code_language": "python",
+ "author": "Patrick J. Burns",
+ "author_links": {
+ "twitter": "@diyclassics",
+ "github": "diyclassics",
+ "website": "https://diyclassics.github.io/"
+ },
+ "category": ["pipeline", "research"],
+ "tags": ["latin"]
+ },
{
"id": "spacy-wasm",
"title": "spacy-wasm",
@@ -334,7 +401,7 @@
},
{
"id": "spacypdfreader",
- "title": "spadypdfreader",
+ "title": "spacypdfreader",
"category": ["pipeline"],
"tags": ["PDF"],
"slogan": "Easy PDF to text to spaCy text extraction in Python.",
@@ -351,7 +418,7 @@
},
"code_example": [
"import spacy",
- "from spacypdfreader import pdf_reader",
+ "from spacypdfreader.spacypdfreader import pdf_reader",
"",
"nlp = spacy.load('en_core_web_sm')",
"doc = pdf_reader('tests/data/test_pdf_01.pdf', nlp)",
@@ -2810,6 +2877,58 @@
"tags": ["coreference", "multi-lingual", "cross-lingual", "allennlp"],
"spacy_version": 3
},
+ {
+ "id": "adeptaugmentations",
+ "title": "Adept Augmentations",
+ "slogan": " A Python library aimed at dissecting and augmenting NER training data for a few-shot scenario.",
+ "description": "EntitySwapAugmenter takes either a `datasets.Dataset` or a `spacy.tokens.DocBin`. Additionally, it is optional to provide a set of labels. It initially creates a knowledge base of entities belonging to a certain label. When running `augmenter.augment()` for N runs, it then creates N new sentences with random swaps of the original entities with an entity of the same corresponding label from the knowledge base.\n\nFor example, assuming that we have knowledge base for `PERSONS`, `LOCATIONS` and `PRODUCTS`. We can then create additional data for the sentence \"Momofuko Ando created instant noodles in Osaka.\" using `augmenter.augment(N=2)`, resulting in \"David created instant noodles in Madrid.\" or \"Tom created Adept Augmentations in the Netherlands\".",
+ "github": "argilla-io/adept-augmentations",
+ "pip": "adept-augmentations",
+ "thumb": "https://raw.githubusercontent.com/argilla-io/adept-augmentations/main/logo.png",
+ "code_example": [
+ "from adept_augmentations import EntitySwapAugmenter",
+ "import spacy",
+ "from spacy.tokens import Doc, DocBin",
+ "nlp = spacy.blank(\"en\")",
+ "",
+ "# Create some example golden data",
+ "example_data = [",
+ " (\"Apple is looking at buying U.K. startup for $1 billion\", [(0, 5, \"ORG\"), (27, 31, \"LOC\"), (44, 54, \"MONEY\")]),",
+ " (\"Microsoft acquires GitHub for $7.5 billion\", [(0, 9, \"ORG\"), (19, 25, \"ORG\"), (30, 42, \"MONEY\")]),",
+ "]",
+ "",
+ "# Create a new DocBin",
+ "nlp = spacy.blank(\"en\")",
+ "docs = []",
+ "for entry in example_data:",
+ " doc = Doc(nlp.vocab, words=entry[0].split())",
+ " doc.ents = [doc.char_span(ent[0], ent[1], label=ent[2]) for ent in entry[1]]",
+ " docs.append(doc)",
+ "golden_dataset = DocBin(docs=docs)",
+ "",
+ "# Augment Data",
+ "augmented_dataset = EntitySwapAugmenter(golden_dataset).augment(4)",
+ "for doc in augmented_dataset.get_docs(nlp.vocab):",
+ " print(doc.text)",
+ "",
+ "# GitHub is looking at buying U.K. startup for $ 7.5 billion",
+ "# Microsoft is looking at buying U.K. startup for $ 1 billion",
+ "# Microsoft is looking at buying U.K. startup for $ 7.5 billion",
+ "# GitHub is looking at buying U.K. startup for $ 1 billion",
+ "# Microsoft acquires Apple for $ 7.5 billion",
+ "# Apple acquires Microsoft for $ 1 billion",
+ "# Microsoft acquires Microsoft for $ 7.5 billion",
+ "# GitHub acquires GitHub for $ 1 billion"
+ ],
+ "author": "David Berenstein",
+ "author_links": {
+ "github": "davidberenstein1957",
+ "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/"
+ },
+ "category": ["standalone"],
+ "tags": ["ner", "few-shot", "augmentation", "datasets", "training"],
+ "spacy_version": 3
+ },
{
"id": "blackstone",
"title": "Blackstone",
@@ -4162,6 +4281,37 @@
},
"category": ["pipeline", "research"],
"tags": ["Thai"]
+ },
+ {
+ "id": "vetiver",
+ "title": "Vetiver",
+ "slogan": "Version, share, deploy, and monitor models.",
+ "description": "The goal of vetiver is to provide fluent tooling to version, deploy, and monitor a trained model. Functions handle creating model objects, versioning models, predicting from a remote API endpoint, deploying Dockerfiles, and more.",
+ "github": "rstudio/vetiver-python",
+ "pip": "vetiver",
+ "code_example": [
+ "import spacy",
+ "from vetiver import VetiverModel, VetiverAPI",
+ "",
+ "# If you use this model, you'll need to download it first:",
+ "# python -m spacy download en_core_web_md",
+ "nlp = spacy.load('en_core_web_md')",
+ "# Create deployable model object with your nlp Language object",
+ "v = VetiverModel(nlp, model_name = 'my_model')",
+ "# Try out your API endpoint locally",
+ "VetiverAPI(v).run()"
+ ],
+ "code_language": "python",
+ "url": "https://vetiver.rstudio.com/",
+ "thumb": "https://raw.githubusercontent.com/rstudio/vetiver-python/main/docs/figures/square-logo.svg",
+ "author": "Posit, PBC",
+ "author_links": {
+ "twitter": "posit_pbc",
+ "github": "rstudio",
+ "website": "https://posit.co/"
+ },
+ "category": ["apis", "standalone"],
+ "tags": ["apis", "deployment"]
}
],
diff --git a/website/public/images/spacy-extension-demo.gif b/website/public/images/spacy-extension-demo.gif
new file mode 100644
index 000000000..a857bbe2d
Binary files /dev/null and b/website/public/images/spacy-extension-demo.gif differ