diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
deleted file mode 100644
index 20d4582cb..000000000
--- a/.github/azure-steps.yml
+++ /dev/null
@@ -1,118 +0,0 @@
-parameters:
- python_version: ''
- architecture: 'x64'
- num_build_jobs: 2
-
-steps:
- - task: UsePythonVersion@0
- inputs:
- versionSpec: ${{ parameters.python_version }}
- architecture: ${{ parameters.architecture }}
- allowUnstable: true
-
- - bash: |
- echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
- displayName: 'Set variables'
-
- - script: |
- python -m pip install -U build pip setuptools
- python -m pip install -U -r requirements.txt
- displayName: "Install dependencies"
-
- - script: |
- python -m build --sdist
- displayName: "Build sdist"
-
- - script: |
- python -m mypy spacy
- displayName: 'Run mypy'
- condition: ne(variables['python_version'], '3.6')
-
- - task: DeleteFiles@1
- inputs:
- contents: "spacy"
- displayName: "Delete source directory"
-
- - task: DeleteFiles@1
- inputs:
- contents: "*.egg-info"
- displayName: "Delete egg-info directory"
-
- - script: |
- python -m pip freeze > installed.txt
- python -m pip uninstall -y -r installed.txt
- displayName: "Uninstall all packages"
-
- - bash: |
- SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
- SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
- displayName: "Install from sdist"
-
- - script: |
- python -W error -c "import spacy"
- displayName: "Test import"
-
- - script: |
- python -m spacy download ca_core_news_sm
- python -m spacy download ca_core_news_md
- python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
- displayName: 'Test download CLI'
- condition: eq(variables['python_version'], '3.9')
-
- - script: |
- python -W error -m spacy info ca_core_news_sm | grep -q download_url
- displayName: 'Test download_url in info CLI'
- condition: eq(variables['python_version'], '3.9')
-
- - script: |
- python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
- displayName: 'Test no warnings on load (#11713)'
- condition: eq(variables['python_version'], '3.9')
-
- - script: |
- python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
- displayName: 'Test convert CLI'
- condition: eq(variables['python_version'], '3.9')
-
- - script: |
- python -m spacy init config -p ner -l ca ner.cfg
- python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
- displayName: 'Test debug config CLI'
- condition: eq(variables['python_version'], '3.9')
-
- - script: |
- # will have errors due to sparse data, check for summary in output
- python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
- displayName: 'Test debug data CLI'
- condition: eq(variables['python_version'], '3.9')
-
- - script: |
- python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
- displayName: 'Test train CLI'
- condition: eq(variables['python_version'], '3.9')
-
- - script: |
- python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
- PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
- displayName: 'Test assemble CLI'
- condition: eq(variables['python_version'], '3.9')
-
- - script: |
- python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
- python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
- displayName: 'Test assemble CLI vectors warning'
- condition: eq(variables['python_version'], '3.9')
-
- - script: |
- python -m pip install -U -r requirements.txt
- displayName: "Install test requirements"
-
- - script: |
- python -m pytest --pyargs spacy -W error
- displayName: "Run CPU tests"
-
- - script: |
- python -m pip install 'spacy[apple]'
- python -m pytest --pyargs spacy
- displayName: "Run CPU tests with thinc-apple-ops"
- condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
deleted file mode 100644
index 83c57a164..000000000
--- a/azure-pipelines.yml
+++ /dev/null
@@ -1,120 +0,0 @@
-trigger:
- batch: true
- branches:
- include:
- - "*"
- exclude:
- - "spacy.io"
- - "nightly.spacy.io"
- - "v2.spacy.io"
- paths:
- exclude:
- - "website/*"
- - "*.md"
- - "*.mdx"
- - ".github/workflows/*"
-pr:
- paths:
- exclude:
- - "*.md"
- - "*.mdx"
- - "website/docs/*"
- - "website/src/*"
- - "website/meta/*.tsx"
- - "website/meta/*.mjs"
- - "website/meta/languages.json"
- - "website/meta/site.json"
- - "website/meta/sidebars.json"
- - "website/meta/type-annotations.json"
- - "website/pages/*"
- - ".github/workflows/*"
-
-jobs:
- # Check formatting and linting. Perform basic checks for most important errors
- # (syntax etc.) Uses the config defined in setup.cfg and overwrites the
- # selected codes.
- - job: "Validate"
- pool:
- vmImage: "ubuntu-latest"
- steps:
- - task: UsePythonVersion@0
- inputs:
- versionSpec: "3.7"
- - script: |
- pip install black -c requirements.txt
- python -m black spacy --check
- displayName: "black"
- - script: |
- pip install flake8==5.0.4
- python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
- displayName: "flake8"
- - script: |
- python .github/validate_universe_json.py website/meta/universe.json
- displayName: 'Validate website/meta/universe.json'
-
- - job: "Test"
- dependsOn: "Validate"
- strategy:
- matrix:
- # We're only running one platform per Python version to speed up builds
- Python36Linux:
- imageName: "ubuntu-20.04"
- python.version: "3.6"
- # Python36Windows:
- # imageName: "windows-latest"
- # python.version: "3.6"
- # Python36Mac:
- # imageName: "macos-latest"
- # python.version: "3.6"
- # Python37Linux:
- # imageName: "ubuntu-20.04"
- # python.version: "3.7"
- Python37Windows:
- imageName: "windows-latest"
- python.version: "3.7"
- # Python37Mac:
- # imageName: "macos-latest"
- # python.version: "3.7"
- # Python38Linux:
- # imageName: "ubuntu-latest"
- # python.version: "3.8"
- # Python38Windows:
- # imageName: "windows-latest"
- # python.version: "3.8"
- Python38Mac:
- imageName: "macos-latest"
- python.version: "3.8"
- Python39Linux:
- imageName: "ubuntu-latest"
- python.version: "3.9"
- # Python39Windows:
- # imageName: "windows-latest"
- # python.version: "3.9"
- # Python39Mac:
- # imageName: "macos-latest"
- # python.version: "3.9"
- # Python310Linux:
- # imageName: "ubuntu-latest"
- # python.version: "3.10"
- Python310Windows:
- imageName: "windows-latest"
- python.version: "3.10"
- # Python310Mac:
- # imageName: "macos-latest"
- # python.version: "3.10"
- Python311Linux:
- imageName: 'ubuntu-latest'
- python.version: '3.11'
- Python311Windows:
- imageName: 'windows-latest'
- python.version: '3.11'
- Python311Mac:
- imageName: 'macos-latest'
- python.version: '3.11'
- maxParallel: 4
- pool:
- vmImage: $(imageName)
- steps:
- - template: .github/azure-steps.yml
- parameters:
- python_version: '$(python.version)'
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 97b4db285..2826cd084 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -337,7 +337,7 @@ def debug_data(
show=verbose,
)
else:
- msg.good("Examples without ocurrences available for all labels")
+ msg.good("Examples without occurrences available for all labels")
if "ner" in factory_names:
# Get all unique NER labels present in the data
diff --git a/spacy/lang/la/__init__.py b/spacy/lang/la/__init__.py
index 15b87c5b9..37164c3f3 100644
--- a/spacy/lang/la/__init__.py
+++ b/spacy/lang/la/__init__.py
@@ -2,12 +2,14 @@ from ...language import Language, BaseDefaults
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
class LatinDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
+ syntax_iterators = SYNTAX_ITERATORS
class Latin(Language):
diff --git a/spacy/lang/la/examples.py b/spacy/lang/la/examples.py
new file mode 100644
index 000000000..db8550070
--- /dev/null
+++ b/spacy/lang/la/examples.py
@@ -0,0 +1,22 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.la.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+# > Caes. BG 1.1
+# > Cic. De Amic. 1
+# > V. Georg. 1.1-5
+# > Gen. 1:1
+# > Galileo, Sid. Nunc.
+# > van Schurman, Opusc. arg. 1
+
+sentences = [
+ "Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.",
+ "Q. Mucius augur multa narrare de C. Laelio socero suo memoriter et iucunde solebat nec dubitare illum in omni sermone appellare sapientem.",
+ "Quid faciat laetas segetes, quo sidere terram uertere, Maecenas, ulmisque adiungere uitis conueniat, quae cura boum, qui cultus habendo sit pecori, apibus quanta experientia parcis, hinc canere incipiam",
+ "In principio creavit Deus caelum et terram.",
+ "Quo sumpto, intelligatur lunaris globus, cuius maximus circulus CAF, centrum vero E, dimetiens CF, qui ad Terre diametrum est ut duo ad septem.",
+ "Cuicunque natura indita sunt principia, seu potentiae principiorum omnium artium, ac scientiarum, ei conveniunt omnes artes ac scientiae.",
+]
diff --git a/spacy/lang/la/lex_attrs.py b/spacy/lang/la/lex_attrs.py
index 9efb4dd3c..9db1218a4 100644
--- a/spacy/lang/la/lex_attrs.py
+++ b/spacy/lang/la/lex_attrs.py
@@ -6,17 +6,16 @@ roman_numerals_compile = re.compile(
r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
)
-_num_words = set(
- """
-unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
+_num_words = """unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem undecim duodecim tredecim quattuordecim quindecim sedecim septendecim duodeviginti undeviginti viginti triginta quadraginta quinquaginta sexaginta septuaginta octoginta nonaginta centum ducenti ducentae ducenta trecenti trecentae trecenta quadringenti quadringentae quadringenta quingenti quingentae quingenta sescenti sescentae sescenta septingenti septingentae septingenta octingenti octingentae octingenta nongenti nongentae nongenta mille
""".split()
-)
-_ordinal_words = set(
- """
-primus prima primum secundus secunda secundum tertius tertia tertium
-""".split()
-)
+_num_words += [item.replace("v", "u") for item in _num_words]
+_num_words = set(_num_words)
+
+_ordinal_words = """primus prima primum secundus secunda secundum tertius tertia tertium quartus quarta quartum quintus quinta quintum sextus sexta sextum septimus septima septimum octavus octava octavum nonus nona nonum decimus decima decimum undecimus undecima undecimum duodecimus duodecima duodecimum duodevicesimus duodevicesima duodevicesimum undevicesimus undevicesima undevicesimum vicesimus vicesima vicesimum tricesimus tricesima tricesimum quadragesimus quadragesima quadragesimum quinquagesimus quinquagesima quinquagesimum sexagesimus sexagesima sexagesimum septuagesimus septuagesima septuagesimum octogesimus octogesima octogesimum nonagesimus nonagesima nonagesimum centesimus centesima centesimum ducentesimus ducentesima ducentesimum trecentesimus trecentesima trecentesimum quadringentesimus quadringentesima quadringentesimum quingentesimus quingentesima quingentesimum sescentesimus sescentesima sescentesimum septingentesimus septingentesima septingentesimum octingentesimus octingentesima octingentesimum nongentesimus nongentesima nongentesimum millesimus millesima millesimum""".split()
+
+_ordinal_words += [item.replace("v", "u") for item in _ordinal_words]
+_ordinal_words = set(_ordinal_words)
def like_num(text):
diff --git a/spacy/lang/la/syntax_iterators.py b/spacy/lang/la/syntax_iterators.py
new file mode 100644
index 000000000..7093bacf9
--- /dev/null
+++ b/spacy/lang/la/syntax_iterators.py
@@ -0,0 +1,85 @@
+from typing import Union, Iterator, Tuple
+from ...tokens import Doc, Span
+from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from ...errors import Errors
+
+# NB: Modified from da on suggestion from https://github.com/explosion/spaCy/issues/7457#issuecomment-800349751 [PJB]
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+ def is_verb_token(tok):
+ return tok.pos in [VERB, AUX]
+
+ def get_left_bound(root):
+ left_bound = root
+ for tok in reversed(list(root.lefts)):
+ if tok.dep in np_left_deps:
+ left_bound = tok
+ return left_bound
+
+ def get_right_bound(doc, root):
+ right_bound = root
+ for tok in root.rights:
+ if tok.dep in np_right_deps:
+ right = get_right_bound(doc, tok)
+ if list(
+ filter(
+ lambda t: is_verb_token(t) or t.dep in stop_deps,
+ doc[root.i : right.i],
+ )
+ ):
+ break
+ else:
+ right_bound = right
+ return right_bound
+
+ def get_bounds(doc, root):
+ return get_left_bound(root), get_right_bound(doc, root)
+
+ doc = doclike.doc # Ensure works on both Doc and Span.
+
+ if not doc.has_annotation("DEP"):
+ raise ValueError(Errors.E029)
+
+ if not len(doc):
+ return
+
+ left_labels = [
+ "det",
+ "fixed",
+ "nmod:poss",
+ "amod",
+ "flat",
+ "goeswith",
+ "nummod",
+ "appos",
+ ]
+ right_labels = [
+ "fixed",
+ "nmod:poss",
+ "amod",
+ "flat",
+ "goeswith",
+ "nummod",
+ "appos",
+ "nmod",
+ "det",
+ ]
+ stop_labels = ["punct"]
+
+ np_label = doc.vocab.strings.add("NP")
+ np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
+ np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
+ stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
+
+ prev_right = -1
+ for token in doclike:
+ if token.pos in [PROPN, NOUN, PRON]:
+ left, right = get_bounds(doc, token)
+ if left.i <= prev_right:
+ continue
+ yield left.i, right.i + 1, np_label
+ prev_right = right.i
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
diff --git a/spacy/lang/la/tokenizer_exceptions.py b/spacy/lang/la/tokenizer_exceptions.py
index 060f6e085..6d14b92c5 100644
--- a/spacy/lang/la/tokenizer_exceptions.py
+++ b/spacy/lang/la/tokenizer_exceptions.py
@@ -12,65 +12,15 @@ _exc = {
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
}
-for orth in [
- "A.",
- "Agr.",
- "Ap.",
- "C.",
- "Cn.",
- "D.",
- "F.",
- "K.",
- "L.",
- "M'.",
- "M.",
- "Mam.",
- "N.",
- "Oct.",
- "Opet.",
- "P.",
- "Paul.",
- "Post.",
- "Pro.",
- "Q.",
- "S.",
- "Ser.",
- "Sert.",
- "Sex.",
- "St.",
- "Sta.",
- "T.",
- "Ti.",
- "V.",
- "Vol.",
- "Vop.",
- "U.",
- "Uol.",
- "Uop.",
- "Ian.",
- "Febr.",
- "Mart.",
- "Apr.",
- "Mai.",
- "Iun.",
- "Iul.",
- "Aug.",
- "Sept.",
- "Oct.",
- "Nov.",
- "Nou.",
- "Dec.",
- "Non.",
- "Id.",
- "A.D.",
- "Coll.",
- "Cos.",
- "Ord.",
- "Pl.",
- "S.C.",
- "Suff.",
- "Trib.",
-]:
+_abbrev_exc = """A. A.D. Aa. Aaa. Acc. Agr. Ap. Apr. April. A.U.C. Aug. C. Caes. Caess. Cc. Cn. Coll. Cons. Conss. Cos. Coss. D. D.N. Dat. Dd. Dec. Decemb. Decembr. F. Feb. Febr. Februar. Ian. Id. Imp. Impp. Imppp. Iul. Iun. K. Kal. L. M'. M. Mai. Mam. Mar. Mart. Med. N. Nn. Nob. Non. Nov. Novemb. Oct. Octob. Opet. Ord. P. Paul. Pf. Pl. Plur. Post. Pp. Prid. Pro. Procos. Q. Quint. S. S.C. Scr. Sept. Septemb. Ser. Sert. Sex. Sext. St. Sta. Suff. T. Ti. Trib. V. Vol. Vop. Vv.""".split()
+
+_abbrev_exc += [item.lower() for item in _abbrev_exc]
+_abbrev_exc += [item.upper() for item in _abbrev_exc]
+_abbrev_exc += [item.replace("v", "u").replace("V", "U") for item in _abbrev_exc]
+
+_abbrev_exc += ["d.N."]
+
+for orth in set(_abbrev_exc):
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index adf96702b..48fb3eb2a 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -432,22 +432,22 @@ cdef class DependencyMatcher:
return [doc[child.i] for child in doc[node].head.children if child.i < node]
def _imm_right_child(self, doc, node):
- for child in doc[node].children:
+ for child in doc[node].rights:
if child.i == node + 1:
return [doc[child.i]]
return []
def _imm_left_child(self, doc, node):
- for child in doc[node].children:
+ for child in doc[node].lefts:
if child.i == node - 1:
return [doc[child.i]]
return []
def _right_child(self, doc, node):
- return [doc[child.i] for child in doc[node].children if child.i > node]
+ return [child for child in doc[node].rights]
def _left_child(self, doc, node):
- return [doc[child.i] for child in doc[node].children if child.i < node]
+ return [child for child in doc[node].lefts]
def _imm_right_parent(self, doc, node):
if doc[node].head.i == node + 1:
diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
index 918d4acdc..49e32b936 100644
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@@ -33,6 +33,8 @@ def test_token_morph_key(i_has):
def test_morph_props(i_has):
assert i_has[0].morph.get("PronType") == ["prs"]
assert i_has[1].morph.get("PronType") == []
+ assert i_has[1].morph.get("AsdfType", ["asdf"]) == ["asdf"]
+ assert i_has[1].morph.get("AsdfType", default=["asdf", "qwer"]) == ["asdf", "qwer"]
def test_morph_iter(i_has):
diff --git a/spacy/tests/lang/la/test_noun_chunks.py b/spacy/tests/lang/la/test_noun_chunks.py
new file mode 100644
index 000000000..ba8f5658b
--- /dev/null
+++ b/spacy/tests/lang/la/test_noun_chunks.py
@@ -0,0 +1,52 @@
+import pytest
+from spacy.tokens import Doc
+
+
+def test_noun_chunks_is_parsed(la_tokenizer):
+    """Test that noun_chunks raises a ValueError for the 'la' language if the Doc is not parsed.
+    The Doc here is raw tokenizer output with no dependency
+    annotation ("DEP"), so the noun-chunks iterator cannot
+    run and must raise instead.
+    """
+ doc = la_tokenizer("Haec est sententia.")
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
+
+
+LA_NP_TEST_EXAMPLES = [
+ (
+ "Haec narrantur a poetis de Perseo.",
+ ["DET", "VERB", "ADP", "NOUN", "ADP", "PROPN", "PUNCT"],
+ ["nsubj:pass", "ROOT", "case", "obl", "case", "obl", "punct"],
+ [1, 0, -1, -1, -3, -1, -5],
+ ["poetis", "Perseo"],
+ ),
+ (
+ "Perseus autem in sinu matris dormiebat.",
+ ["NOUN", "ADV", "ADP", "NOUN", "NOUN", "VERB", "PUNCT"],
+ ["nsubj", "discourse", "case", "obl", "nmod", "ROOT", "punct"],
+ [5, 4, 3, -1, -1, 0, -1],
+ ["Perseus", "sinu matris"],
+ ),
+]
+
+
+@pytest.mark.parametrize(
+ "text,pos,deps,heads,expected_noun_chunks", LA_NP_TEST_EXAMPLES
+)
+def test_la_noun_chunks(la_tokenizer, text, pos, deps, heads, expected_noun_chunks):
+ tokens = la_tokenizer(text)
+
+ assert len(heads) == len(pos)
+ doc = Doc(
+ tokens.vocab,
+ words=[t.text for t in tokens],
+ heads=[head + i for i, head in enumerate(heads)],
+ deps=deps,
+ pos=pos,
+ )
+
+ noun_chunks = list(doc.noun_chunks)
+ assert len(noun_chunks) == len(expected_noun_chunks)
+ for i, np in enumerate(noun_chunks):
+ assert np.text == expected_noun_chunks[i]
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 0e75b5f7a..a4a68ae8e 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -834,10 +834,12 @@ cdef class Tokenizer:
self.token_match = re.compile(data["token_match"]).match
if "url_match" in data and isinstance(data["url_match"], str):
self.url_match = re.compile(data["url_match"]).match
- if "rules" in data and isinstance(data["rules"], dict):
- self.rules = data["rules"]
if "faster_heuristics" in data:
self.faster_heuristics = data["faster_heuristics"]
+ # always load rules last so that all other settings are set before the
+ # internal tokenization for the phrase matcher
+ if "rules" in data and isinstance(data["rules"], dict):
+ self.rules = data["rules"]
return self
diff --git a/spacy/tokens/morphanalysis.pyi b/spacy/tokens/morphanalysis.pyi
index b86203cc4..a5376e80d 100644
--- a/spacy/tokens/morphanalysis.pyi
+++ b/spacy/tokens/morphanalysis.pyi
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Iterator, List, Union
+from typing import Any, Dict, Iterator, List, Optional, Union
from ..vocab import Vocab
class MorphAnalysis:
@@ -13,7 +13,7 @@ class MorphAnalysis:
def __hash__(self) -> int: ...
def __eq__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
def __ne__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
- def get(self, field: Any) -> List[str]: ...
+    def get(self, field: Any, default: Optional[List[str]] = ...) -> List[str]: ...
def to_json(self) -> str: ...
def to_dict(self) -> Dict[str, str]: ...
def __str__(self) -> str: ...
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index a7d1f2e44..baa3800a1 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -58,10 +58,14 @@ cdef class MorphAnalysis:
def __ne__(self, other):
return self.key != other.key
- def get(self, field):
+ def get(self, field, default=None):
"""Retrieve feature values by field."""
cdef attr_t field_id = self.vocab.strings.as_int(field)
cdef np.ndarray results = get_by_field(&self.c, field_id)
+ if len(results) == 0:
+ if default is None:
+ default = []
+ return default
features = [self.vocab.strings[result] for result in results]
return [f.split(Morphology.FIELD_SEP)[1] for f in features]
diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx
index 14e0916d1..d0971da55 100644
--- a/website/docs/api/dependencymatcher.mdx
+++ b/website/docs/api/dependencymatcher.mdx
@@ -68,28 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
come directly from
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
-| Symbol | Description |
-| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
-| `A < B` | `A` is the immediate dependent of `B`. |
-| `A > B` | `A` is the immediate head of `B`. |
-| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
-| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
-| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
-| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
-| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
-| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
-| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
-| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
-| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
-| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
-| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
-| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
-| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
-| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
-| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| Symbol | Description |
+| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
+| `A < B` | `A` is the immediate dependent of `B`. |
+| `A > B` | `A` is the immediate head of `B`. |
+| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
+| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
+| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
+| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. |
+| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. |
+| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. |
+| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
+| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
+| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
+| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
+| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
+| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. |
+| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. |
+| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
+| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. |
+| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. |
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}
diff --git a/website/docs/api/morphology.mdx b/website/docs/api/morphology.mdx
index 68d80b814..5d4affafe 100644
--- a/website/docs/api/morphology.mdx
+++ b/website/docs/api/morphology.mdx
@@ -213,10 +213,11 @@ Retrieve values for a feature by field.
> assert morph.get("Feat1") == ["Val1", "Val2"]
> ```
-| Name | Description |
-| ----------- | ------------------------------------------------ |
-| `field` | The field to retrieve. ~~str~~ |
-| **RETURNS** | A list of the individual features. ~~List[str]~~ |
+| Name | Description |
+| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| `field` | The field to retrieve. ~~str~~ |
+| `default` 3.6 | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ |
+| **RETURNS** | A list of the individual features. ~~List[str]~~ |
### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"}
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 7e88bdc1f..39be5f47b 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -1096,28 +1096,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
come directly from
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
-| Symbol | Description |
-| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
-| `A < B` | `A` is the immediate dependent of `B`. |
-| `A > B` | `A` is the immediate head of `B`. |
-| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
-| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
-| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
-| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
-| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
-| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
-| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
-| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
-| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
-| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
-| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
-| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
-| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
-| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
-| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| Symbol | Description |
+| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
+| `A < B` | `A` is the immediate dependent of `B`. |
+| `A > B` | `A` is the immediate head of `B`. |
+| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
+| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
+| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
+| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. |
+| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. |
+| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. |
+| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
+| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
+| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
+| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
+| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
+| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. |
+| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. |
+| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
+| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. |
+| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. |
### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 5fd1c2287..4067c4d1e 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,48 @@
{
"resources": [
+ {
+ "id": "spacy-wasm",
+ "title": "spacy-wasm",
+ "slogan": "spaCy in the browser using WebAssembly",
+ "description": "Run spaCy directly in the browser with WebAssembly. Using Pyodide, the application loads the spaCy model and renders the text prompt with displaCy.",
+ "url": "https://spacy-wasm.vercel.app/",
+ "github": "SyedAhkam/spacy-wasm",
+ "code_language": "python",
+ "author": "Syed Ahkam",
+ "author_links": {
+ "twitter": "@SyedAhkam1",
+ "github": "SyedAhkam"
+ },
+ "category": ["visualizers"],
+ "tags": ["visualization", "deployment"]
+ },
+ {
+ "id": "spacysee",
+ "title": "spaCysee",
+ "slogan": "Visualize spaCy's Dependency Parsing, POS tagging, and morphological analysis",
+ "description": "A project that helps you visualize your spaCy docs in Jupyter notebooks. Each of the dependency tags, POS tags and morphological features are clickable. Clicking on a tag will bring up the relevant documentation for that tag.",
+ "github": "moxley01/spacysee",
+ "pip": "spacysee",
+ "code_example": [
+ "import spacy",
+ "from spacysee import render",
+ "",
+ "nlp = spacy.load('en_core_web_sm')",
+ "doc = nlp('This is a neat way to visualize your spaCy docs')",
+ "render(doc, width='500', height='500')"
+ ],
+ "code_language": "python",
+ "thumb": "https://www.mattoxley.com/static/images/spacysee_logo.svg",
+ "image": "https://www.mattoxley.com/static/images/spacysee_logo.svg",
+ "author": "Matt Oxley",
+ "author_links": {
+ "twitter": "matt0xley",
+ "github": "moxley01",
+ "website": "https://mattoxley.com"
+ },
+ "category": ["visualizers"],
+ "tags": ["visualization"]
+ },
{
"id": "grecy",
"title": "greCy",
@@ -1555,7 +1598,7 @@
"twitter": "allenai_org",
"website": "http://allenai.org"
},
- "category": ["scientific", "models", "research"]
+ "category": ["scientific", "models", "research", "biomedical"]
},
{
"id": "textacy",
diff --git a/website/src/templates/index.js b/website/src/templates/index.js
index 4c10e09c5..227b25be8 100644
--- a/website/src/templates/index.js
+++ b/website/src/templates/index.js
@@ -57,15 +57,9 @@ const AlertSpace = ({ nightly, legacy }) => {
)
}
-// const navAlert = (
-//
-// 💥 Out now: spaCy v3.5
-//
-// )
-
const navAlert = (
-
- 💥 Take the user survey!
+
+ 💥 Out now: spaCy v3.5
)