mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-22 01:51:58 +03:00
Merge branch 'master' of https://github.com/explosion/spaCy into add-span-finder
This commit is contained in:
commit
3b41a988b0
118
.github/azure-steps.yml
vendored
118
.github/azure-steps.yml
vendored
|
@ -1,118 +0,0 @@
|
|||
parameters:
|
||||
python_version: ''
|
||||
architecture: 'x64'
|
||||
num_build_jobs: 2
|
||||
|
||||
steps:
|
||||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: ${{ parameters.python_version }}
|
||||
architecture: ${{ parameters.architecture }}
|
||||
allowUnstable: true
|
||||
|
||||
- bash: |
|
||||
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
|
||||
displayName: 'Set variables'
|
||||
|
||||
- script: |
|
||||
python -m pip install -U build pip setuptools
|
||||
python -m pip install -U -r requirements.txt
|
||||
displayName: "Install dependencies"
|
||||
|
||||
- script: |
|
||||
python -m build --sdist
|
||||
displayName: "Build sdist"
|
||||
|
||||
- script: |
|
||||
python -m mypy spacy
|
||||
displayName: 'Run mypy'
|
||||
condition: ne(variables['python_version'], '3.6')
|
||||
|
||||
- task: DeleteFiles@1
|
||||
inputs:
|
||||
contents: "spacy"
|
||||
displayName: "Delete source directory"
|
||||
|
||||
- task: DeleteFiles@1
|
||||
inputs:
|
||||
contents: "*.egg-info"
|
||||
displayName: "Delete egg-info directory"
|
||||
|
||||
- script: |
|
||||
python -m pip freeze > installed.txt
|
||||
python -m pip uninstall -y -r installed.txt
|
||||
displayName: "Uninstall all packages"
|
||||
|
||||
- bash: |
|
||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||
SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
|
||||
displayName: "Install from sdist"
|
||||
|
||||
- script: |
|
||||
python -W error -c "import spacy"
|
||||
displayName: "Test import"
|
||||
|
||||
- script: |
|
||||
python -m spacy download ca_core_news_sm
|
||||
python -m spacy download ca_core_news_md
|
||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
displayName: 'Test download CLI'
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||
displayName: 'Test download_url in info CLI'
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||
displayName: 'Test no warnings on load (#11713)'
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||
displayName: 'Test convert CLI'
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -m spacy init config -p ner -l ca ner.cfg
|
||||
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
|
||||
displayName: 'Test debug config CLI'
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
# will have errors due to sparse data, check for summary in output
|
||||
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
|
||||
displayName: 'Test debug data CLI'
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
||||
displayName: 'Test train CLI'
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
displayName: 'Test assemble CLI'
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
displayName: 'Test assemble CLI vectors warning'
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -m pip install -U -r requirements.txt
|
||||
displayName: "Install test requirements"
|
||||
|
||||
- script: |
|
||||
python -m pytest --pyargs spacy -W error
|
||||
displayName: "Run CPU tests"
|
||||
|
||||
- script: |
|
||||
python -m pip install 'spacy[apple]'
|
||||
python -m pytest --pyargs spacy
|
||||
displayName: "Run CPU tests with thinc-apple-ops"
|
||||
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
|
|
@ -1,120 +0,0 @@
|
|||
trigger:
|
||||
batch: true
|
||||
branches:
|
||||
include:
|
||||
- "*"
|
||||
exclude:
|
||||
- "spacy.io"
|
||||
- "nightly.spacy.io"
|
||||
- "v2.spacy.io"
|
||||
paths:
|
||||
exclude:
|
||||
- "website/*"
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- ".github/workflows/*"
|
||||
pr:
|
||||
paths:
|
||||
exclude:
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- "website/docs/*"
|
||||
- "website/src/*"
|
||||
- "website/meta/*.tsx"
|
||||
- "website/meta/*.mjs"
|
||||
- "website/meta/languages.json"
|
||||
- "website/meta/site.json"
|
||||
- "website/meta/sidebars.json"
|
||||
- "website/meta/type-annotations.json"
|
||||
- "website/pages/*"
|
||||
- ".github/workflows/*"
|
||||
|
||||
jobs:
|
||||
# Check formatting and linting. Perform basic checks for most important errors
|
||||
# (syntax etc.) Uses the config defined in setup.cfg and overwrites the
|
||||
# selected codes.
|
||||
- job: "Validate"
|
||||
pool:
|
||||
vmImage: "ubuntu-latest"
|
||||
steps:
|
||||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: "3.7"
|
||||
- script: |
|
||||
pip install black -c requirements.txt
|
||||
python -m black spacy --check
|
||||
displayName: "black"
|
||||
- script: |
|
||||
pip install flake8==5.0.4
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||
displayName: "flake8"
|
||||
- script: |
|
||||
python .github/validate_universe_json.py website/meta/universe.json
|
||||
displayName: 'Validate website/meta/universe.json'
|
||||
|
||||
- job: "Test"
|
||||
dependsOn: "Validate"
|
||||
strategy:
|
||||
matrix:
|
||||
# We're only running one platform per Python version to speed up builds
|
||||
Python36Linux:
|
||||
imageName: "ubuntu-20.04"
|
||||
python.version: "3.6"
|
||||
# Python36Windows:
|
||||
# imageName: "windows-latest"
|
||||
# python.version: "3.6"
|
||||
# Python36Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.6"
|
||||
# Python37Linux:
|
||||
# imageName: "ubuntu-20.04"
|
||||
# python.version: "3.7"
|
||||
Python37Windows:
|
||||
imageName: "windows-latest"
|
||||
python.version: "3.7"
|
||||
# Python37Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.7"
|
||||
# Python38Linux:
|
||||
# imageName: "ubuntu-latest"
|
||||
# python.version: "3.8"
|
||||
# Python38Windows:
|
||||
# imageName: "windows-latest"
|
||||
# python.version: "3.8"
|
||||
Python38Mac:
|
||||
imageName: "macos-latest"
|
||||
python.version: "3.8"
|
||||
Python39Linux:
|
||||
imageName: "ubuntu-latest"
|
||||
python.version: "3.9"
|
||||
# Python39Windows:
|
||||
# imageName: "windows-latest"
|
||||
# python.version: "3.9"
|
||||
# Python39Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.9"
|
||||
# Python310Linux:
|
||||
# imageName: "ubuntu-latest"
|
||||
# python.version: "3.10"
|
||||
Python310Windows:
|
||||
imageName: "windows-latest"
|
||||
python.version: "3.10"
|
||||
# Python310Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.10"
|
||||
Python311Linux:
|
||||
imageName: 'ubuntu-latest'
|
||||
python.version: '3.11'
|
||||
Python311Windows:
|
||||
imageName: 'windows-latest'
|
||||
python.version: '3.11'
|
||||
Python311Mac:
|
||||
imageName: 'macos-latest'
|
||||
python.version: '3.11'
|
||||
maxParallel: 4
|
||||
pool:
|
||||
vmImage: $(imageName)
|
||||
steps:
|
||||
- template: .github/azure-steps.yml
|
||||
parameters:
|
||||
python_version: '$(python.version)'
|
|
@ -337,7 +337,7 @@ def debug_data(
|
|||
show=verbose,
|
||||
)
|
||||
else:
|
||||
msg.good("Examples without ocurrences available for all labels")
|
||||
msg.good("Examples without occurrences available for all labels")
|
||||
|
||||
if "ner" in factory_names:
|
||||
# Get all unique NER labels present in the data
|
||||
|
|
|
@ -2,12 +2,14 @@ from ...language import Language, BaseDefaults
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class LatinDefaults(BaseDefaults):
    """Latin-specific settings layered over `BaseDefaults`.

    Each attribute plugs in data imported from a sibling module of this
    package (`tokenizer_exceptions`, `stop_words`, `lex_attrs` and
    `syntax_iterators`).
    """

    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS
    # Syntax iterators imported from `.syntax_iterators`.
    syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Latin(Language):
|
||||
|
|
22
spacy/lang/la/examples.py
Normal file
22
spacy/lang/la/examples.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.la.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
# Sources of the example sentences, listed in the same order as the entries
# of `sentences` below (one citation per sentence):
# > Caes. BG 1.1
# > Cic. De Amic. 1
# > V. Georg. 1.1-5
# > Gen. 1:1
# > Galileo, Sid. Nunc.
# > van Schurman, Opusc. arg. 1

sentences = [
    "Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.",
    "Q. Mucius augur multa narrare de C. Laelio socero suo memoriter et iucunde solebat nec dubitare illum in omni sermone appellare sapientem.",
    "Quid faciat laetas segetes, quo sidere terram uertere, Maecenas, ulmisque adiungere uitis conueniat, quae cura boum, qui cultus habendo sit pecori, apibus quanta experientia parcis, hinc canere incipiam",
    "In principio creavit Deus caelum et terram.",
    "Quo sumpto, intelligatur lunaris globus, cuius maximus circulus CAF, centrum vero E, dimetiens CF, qui ad Terre diametrum est ut duo ad septem.",
    "Cuicunque natura indita sunt principia, seu potentiae principiorum omnium artium, ac scientiarum, ei conveniunt omnes artes ac scientiae.",
]
|
|
@ -6,17 +6,16 @@ roman_numerals_compile = re.compile(
|
|||
r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
|
||||
)
|
||||
|
||||
_num_words = set(
|
||||
"""
|
||||
unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
|
||||
_num_words = """unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem undecim duodecim tredecim quattuordecim quindecim sedecim septendecim duodeviginti undeviginti viginti triginta quadraginta quinquaginta sexaginta septuaginta octoginta nonaginta centum ducenti ducentae ducenta trecenti trecentae trecenta quadringenti quadringentae quadringenta quingenti quingentae quingenta sescenti sescentae sescenta septingenti septingentae septingenta octingenti octingentae octingenta nongenti nongentae nongenta mille
|
||||
""".split()
|
||||
)
|
||||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
primus prima primum secundus secunda secundum tertius tertia tertium
|
||||
""".split()
|
||||
)
|
||||
_num_words += [item.replace("v", "u") for item in _num_words]
|
||||
_num_words = set(_num_words)
|
||||
|
||||
_ordinal_words = """primus prima primum secundus secunda secundum tertius tertia tertium quartus quarta quartum quintus quinta quintum sextus sexta sextum septimus septima septimum octavus octava octavum nonus nona nonum decimus decima decimum undecimus undecima undecimum duodecimus duodecima duodecimum duodevicesimus duodevicesima duodevicesimum undevicesimus undevicesima undevicesimum vicesimus vicesima vicesimum tricesimus tricesima tricesimum quadragesimus quadragesima quadragesimum quinquagesimus quinquagesima quinquagesimum sexagesimus sexagesima sexagesimum septuagesimus septuagesima septuagesimum octogesimus octogesima octogesimum nonagesimus nonagesima nonagesimum centesimus centesima centesimum ducentesimus ducentesima ducentesimum trecentesimus trecentesima trecentesimum quadringentesimus quadringentesima quadringentesimum quingentesimus quingentesima quingentesimum sescentesimus sescentesima sescentesimum septingentesimus septingentesima septingentesimum octingentesimus octingentesima octingentesimum nongentesimus nongentesima nongentesimum millesimus millesima millesimum""".split()
|
||||
|
||||
_ordinal_words += [item.replace("v", "u") for item in _ordinal_words]
|
||||
_ordinal_words = set(_ordinal_words)
|
||||
|
||||
|
||||
def like_num(text):
|
||||
|
|
85
spacy/lang/la/syntax_iterators.py
Normal file
85
spacy/lang/la/syntax_iterators.py
Normal file
|
@ -0,0 +1,85 @@
|
|||
from typing import Union, Iterator, Tuple
|
||||
from ...tokens import Doc, Span
|
||||
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||
from ...errors import Errors
|
||||
|
||||
# NB: Modified from da on suggestion from https://github.com/explosion/spaCy/issues/7457#issuecomment-800349751 [PJB]
|
||||
|
||||
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """Detect base noun phrases from a dependency parse.

    doclike (Union[Doc, Span]): The object to scan. Chunk roots are taken
        from the tokens of `doclike`; chunk bounds are resolved on its
        parent Doc.
    YIELDS (Tuple[int, int, int]): `(start, end, label)` triples, where
        `start` and `end` are token indices (`end` exclusive) and `label`
        is the string-store ID of "NP".
    RAISES (ValueError): With `Errors.E029` if the Doc has no "DEP"
        annotation. As this is a generator, the error surfaces once
        iteration begins.
    """
    doc = doclike.doc  # Works for both Doc and Span input.

    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    if len(doc) == 0:
        return

    # Dependency labels that may extend a chunk to the left of its root.
    left_labels = [
        "det",
        "fixed",
        "nmod:poss",
        "amod",
        "flat",
        "goeswith",
        "nummod",
        "appos",
    ]
    # Dependency labels that may extend a chunk to the right of its root.
    right_labels = [
        "fixed",
        "nmod:poss",
        "amod",
        "flat",
        "goeswith",
        "nummod",
        "appos",
        "nmod",
        "det",
    ]
    # Dependency labels that block a rightward extension when crossed.
    stop_labels = ["punct"]

    strings = doc.vocab.strings
    np_label = strings.add("NP")
    np_left_deps = [strings.add(label) for label in left_labels]
    np_right_deps = [strings.add(label) for label in right_labels]
    stop_deps = [strings.add(label) for label in stop_labels]

    def is_verb_token(tok):
        return tok.pos in (VERB, AUX)

    def leftmost_bound(head):
        # The leftmost left child carrying an allowed label, else the head.
        return next(
            (child for child in head.lefts if child.dep in np_left_deps), head
        )

    def rightmost_bound(doc, head):
        bound = head
        for child in head.rights:
            if child.dep not in np_right_deps:
                continue
            candidate = rightmost_bound(doc, child)
            # Stop extending as soon as the stretch from the head up to the
            # candidate contains a verb/auxiliary or a stop dependency.
            blocked = any(
                is_verb_token(t) or t.dep in stop_deps
                for t in doc[head.i : candidate.i]
            )
            if blocked:
                break
            bound = candidate
        return bound

    last_end = -1
    for token in doclike:
        if token.pos not in (PROPN, NOUN, PRON):
            continue
        left = leftmost_bound(token)
        right = rightmost_bound(doc, token)
        # Skip chunks that would overlap the previously yielded one.
        if left.i <= last_end:
            continue
        yield left.i, right.i + 1, np_label
        last_end = right.i
|
||||
prev_right = right.i
|
||||
|
||||
|
||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
|
@ -12,65 +12,15 @@ _exc = {
|
|||
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
|
||||
}
|
||||
|
||||
for orth in [
|
||||
"A.",
|
||||
"Agr.",
|
||||
"Ap.",
|
||||
"C.",
|
||||
"Cn.",
|
||||
"D.",
|
||||
"F.",
|
||||
"K.",
|
||||
"L.",
|
||||
"M'.",
|
||||
"M.",
|
||||
"Mam.",
|
||||
"N.",
|
||||
"Oct.",
|
||||
"Opet.",
|
||||
"P.",
|
||||
"Paul.",
|
||||
"Post.",
|
||||
"Pro.",
|
||||
"Q.",
|
||||
"S.",
|
||||
"Ser.",
|
||||
"Sert.",
|
||||
"Sex.",
|
||||
"St.",
|
||||
"Sta.",
|
||||
"T.",
|
||||
"Ti.",
|
||||
"V.",
|
||||
"Vol.",
|
||||
"Vop.",
|
||||
"U.",
|
||||
"Uol.",
|
||||
"Uop.",
|
||||
"Ian.",
|
||||
"Febr.",
|
||||
"Mart.",
|
||||
"Apr.",
|
||||
"Mai.",
|
||||
"Iun.",
|
||||
"Iul.",
|
||||
"Aug.",
|
||||
"Sept.",
|
||||
"Oct.",
|
||||
"Nov.",
|
||||
"Nou.",
|
||||
"Dec.",
|
||||
"Non.",
|
||||
"Id.",
|
||||
"A.D.",
|
||||
"Coll.",
|
||||
"Cos.",
|
||||
"Ord.",
|
||||
"Pl.",
|
||||
"S.C.",
|
||||
"Suff.",
|
||||
"Trib.",
|
||||
]:
|
||||
_abbrev_exc = """A. A.D. Aa. Aaa. Acc. Agr. Ap. Apr. April. A.U.C. Aug. C. Caes. Caess. Cc. Cn. Coll. Cons. Conss. Cos. Coss. D. D.N. Dat. Dd. Dec. Decemb. Decembr. F. Feb. Febr. Februar. Ian. Id. Imp. Impp. Imppp. Iul. Iun. K. Kal. L. M'. M. Mai. Mam. Mar. Mart. Med. N. Nn. Nob. Non. Nov. Novemb. Oct. Octob. Opet. Ord. P. Paul. Pf. Pl. Plur. Post. Pp. Prid. Pro. Procos. Q. Quint. S. S.C. Scr. Sept. Septemb. Ser. Sert. Sex. Sext. St. Sta. Suff. T. Ti. Trib. V. Vol. Vop. Vv.""".split()
|
||||
|
||||
_abbrev_exc += [item.lower() for item in _abbrev_exc]
|
||||
_abbrev_exc += [item.upper() for item in _abbrev_exc]
|
||||
_abbrev_exc += [item.replace("v", "u").replace("V", "U") for item in _abbrev_exc]
|
||||
|
||||
_abbrev_exc += ["d.N."]
|
||||
|
||||
for orth in set(_abbrev_exc):
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -432,22 +432,22 @@ cdef class DependencyMatcher:
|
|||
return [doc[child.i] for child in doc[node].head.children if child.i < node]
|
||||
|
||||
def _imm_right_child(self, doc, node):
|
||||
for child in doc[node].children:
|
||||
for child in doc[node].rights:
|
||||
if child.i == node + 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def _imm_left_child(self, doc, node):
|
||||
for child in doc[node].children:
|
||||
for child in doc[node].lefts:
|
||||
if child.i == node - 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def _right_child(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].children if child.i > node]
|
||||
return [child for child in doc[node].rights]
|
||||
|
||||
def _left_child(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].children if child.i < node]
|
||||
return [child for child in doc[node].lefts]
|
||||
|
||||
def _imm_right_parent(self, doc, node):
|
||||
if doc[node].head.i == node + 1:
|
||||
|
|
|
@ -33,6 +33,8 @@ def test_token_morph_key(i_has):
|
|||
def test_morph_props(i_has):
|
||||
assert i_has[0].morph.get("PronType") == ["prs"]
|
||||
assert i_has[1].morph.get("PronType") == []
|
||||
assert i_has[1].morph.get("AsdfType", ["asdf"]) == ["asdf"]
|
||||
assert i_has[1].morph.get("AsdfType", default=["asdf", "qwer"]) == ["asdf", "qwer"]
|
||||
|
||||
|
||||
def test_morph_iter(i_has):
|
||||
|
|
52
spacy/tests/lang/la/test_noun_chunks.py
Normal file
52
spacy/tests/lang/la/test_noun_chunks.py
Normal file
|
@ -0,0 +1,52 @@
|
|||
import pytest
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_noun_chunks_is_parsed(la_tokenizer):
    """Test that noun_chunks raises ValueError for 'la' language if Doc is not parsed.

    The Doc used here comes straight from the tokenizer fixture and no
    heads or dependency labels are supplied, so iterating over
    `doc.noun_chunks` is expected to fail instead of yielding chunks.
    """
    doc = la_tokenizer("Haec est sententia.")
    with pytest.raises(ValueError):
        # noun_chunks is consumed via list() so that the iterator actually runs.
        list(doc.noun_chunks)
|
||||
|
||||
|
||||
# Each example is a tuple of
#   (text, pos tags, dependency labels, heads, expected noun chunk texts).
# The heads are relative offsets: the test converts them to absolute token
# indices by adding each token's own position.
LA_NP_TEST_EXAMPLES = [
    (
        "Haec narrantur a poetis de Perseo.",
        ["DET", "VERB", "ADP", "NOUN", "ADP", "PROPN", "PUNCT"],
        ["nsubj:pass", "ROOT", "case", "obl", "case", "obl", "punct"],
        [1, 0, -1, -1, -3, -1, -5],
        ["poetis", "Perseo"],
    ),
    (
        "Perseus autem in sinu matris dormiebat.",
        ["NOUN", "ADV", "ADP", "NOUN", "NOUN", "VERB", "PUNCT"],
        ["nsubj", "discourse", "case", "obl", "nmod", "ROOT", "punct"],
        [5, 4, 3, -1, -1, 0, -1],
        ["Perseus", "sinu matris"],
    ),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "text,pos,deps,heads,expected_noun_chunks", LA_NP_TEST_EXAMPLES
)
def test_la_noun_chunks(la_tokenizer, text, pos, deps, heads, expected_noun_chunks):
    """Build an annotated Doc by hand and compare its noun chunks to the expected texts."""
    tokens = la_tokenizer(text)

    assert len(heads) == len(pos)

    # `heads` holds offsets relative to each token; Doc expects absolute indices.
    words = [t.text for t in tokens]
    absolute_heads = [offset + position for position, offset in enumerate(heads)]
    doc = Doc(tokens.vocab, words=words, heads=absolute_heads, deps=deps, pos=pos)

    noun_chunks = list(doc.noun_chunks)
    assert len(noun_chunks) == len(expected_noun_chunks)
    for chunk, expected_text in zip(noun_chunks, expected_noun_chunks):
        assert chunk.text == expected_text
|
|
@ -834,10 +834,12 @@ cdef class Tokenizer:
|
|||
self.token_match = re.compile(data["token_match"]).match
|
||||
if "url_match" in data and isinstance(data["url_match"], str):
|
||||
self.url_match = re.compile(data["url_match"]).match
|
||||
if "rules" in data and isinstance(data["rules"], dict):
|
||||
self.rules = data["rules"]
|
||||
if "faster_heuristics" in data:
|
||||
self.faster_heuristics = data["faster_heuristics"]
|
||||
# always load rules last so that all other settings are set before the
|
||||
# internal tokenization for the phrase matcher
|
||||
if "rules" in data and isinstance(data["rules"], dict):
|
||||
self.rules = data["rules"]
|
||||
return self
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Any, Dict, Iterator, List, Union
|
||||
from typing import Any, Dict, Iterator, List, Optional, Union
|
||||
from ..vocab import Vocab
|
||||
|
||||
class MorphAnalysis:
|
||||
|
@ -13,7 +13,7 @@ class MorphAnalysis:
|
|||
def __hash__(self) -> int: ...
|
||||
def __eq__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
|
||||
def __ne__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
|
||||
def get(self, field: Any) -> List[str]: ...
|
||||
# Retrieve feature values by field. `default` is optional: the implementation
# declares it as `default=None`, so the stub must give it a default value too,
# otherwise type checkers reject the one-argument call `morph.get(field)`.
def get(self, field: Any, default: Optional[List[str]] = None) -> List[str]: ...
|
||||
def to_json(self) -> str: ...
|
||||
def to_dict(self) -> Dict[str, str]: ...
|
||||
def __str__(self) -> str: ...
|
||||
|
|
|
@ -58,10 +58,14 @@ cdef class MorphAnalysis:
|
|||
def __ne__(self, other):
|
||||
return self.key != other.key
|
||||
|
||||
def get(self, field):
|
||||
def get(self, field, default=None):
|
||||
"""Retrieve feature values by field."""
|
||||
cdef attr_t field_id = self.vocab.strings.as_int(field)
|
||||
cdef np.ndarray results = get_by_field(&self.c, field_id)
|
||||
if len(results) == 0:
|
||||
if default is None:
|
||||
default = []
|
||||
return default
|
||||
features = [self.vocab.strings[result] for result in results]
|
||||
return [f.split(Morphology.FIELD_SEP)[1] for f in features]
|
||||
|
||||
|
|
|
@ -68,28 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
|
|||
come directly from
|
||||
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
||||
|
||||
| Symbol | Description |
|
||||
| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| Symbol | Description |
|
||||
| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. |
|
||||
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. |
|
||||
|
||||
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}
|
||||
|
||||
|
|
|
@ -213,10 +213,11 @@ Retrieve values for a feature by field.
|
|||
> assert morph.get("Feat1") == ["Val1", "Val2"]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------ |
|
||||
| `field` | The field to retrieve. ~~str~~ |
|
||||
| **RETURNS** | A list of the individual features. ~~List[str]~~ |
|
||||
| Name | Description |
|
||||
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `field` | The field to retrieve. ~~str~~ |
|
||||
| `default` <Tag variant="new">3.6</Tag> | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ |
|
||||
| **RETURNS** | A list of the individual features. ~~List[str]~~ |
|
||||
|
||||
### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"}
|
||||
|
||||
|
|
|
@ -1096,28 +1096,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
|
|||
come directly from
|
||||
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
||||
|
||||
| Symbol | Description |
|
||||
| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| Symbol | Description |
|
||||
| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. |
|
||||
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. |
|
||||
|
||||
### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
|
||||
|
||||
|
|
|
@ -1,5 +1,48 @@
|
|||
{
|
||||
"resources": [
|
||||
{
|
||||
"id": "spacy-wasm",
|
||||
"title": "spacy-wasm",
|
||||
"slogan": "spaCy in the browser using WebAssembly",
|
||||
"description": "Run spaCy directly in the browser with WebAssembly. Using Pyodide, the application loads the spaCy model and renders the text prompt with displaCy.",
|
||||
"url": "https://spacy-wasm.vercel.app/",
|
||||
"github": "SyedAhkam/spacy-wasm",
|
||||
"code_language": "python",
|
||||
"author": "Syed Ahkam",
|
||||
"author_links": {
|
||||
"twitter": "@SyedAhkam1",
|
||||
"github": "SyedAhkam"
|
||||
},
|
||||
"category": ["visualizers"],
|
||||
"tags": ["visualization", "deployment"]
|
||||
},
|
||||
{
|
||||
"id": "spacysee",
|
||||
"title": "spaCysee",
|
||||
"slogan": "Visualize spaCy's Dependency Parsing, POS tagging, and morphological analysis",
|
||||
"description": "A project that helps you visualize your spaCy docs in Jupyter notebooks. Each of the dependency tags, POS tags and morphological features is clickable. Clicking on a tag will bring up the relevant documentation for that tag.",
|
||||
"github": "moxley01/spacysee",
|
||||
"pip": "spacysee",
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"from spacysee import render",
|
||||
"",
|
||||
"nlp = spacy.load('en_core_web_sm')",
|
||||
"doc = nlp('This is a neat way to visualize your spaCy docs')",
|
||||
"render(doc, width='500', height='500')"
|
||||
],
|
||||
"code_language": "python",
|
||||
"thumb": "https://www.mattoxley.com/static/images/spacysee_logo.svg",
|
||||
"image": "https://www.mattoxley.com/static/images/spacysee_logo.svg",
|
||||
"author": "Matt Oxley",
|
||||
"author_links": {
|
||||
"twitter": "matt0xley",
|
||||
"github": "moxley01",
|
||||
"website": "https://mattoxley.com"
|
||||
},
|
||||
"category": ["visualizers"],
|
||||
"tags": ["visualization"]
|
||||
},
|
||||
{
|
||||
"id": "grecy",
|
||||
"title": "greCy",
|
||||
|
@ -1555,7 +1598,7 @@
|
|||
"twitter": "allenai_org",
|
||||
"website": "http://allenai.org"
|
||||
},
|
||||
"category": ["scientific", "models", "research"]
|
||||
"category": ["scientific", "models", "research", "biomedical"]
|
||||
},
|
||||
{
|
||||
"id": "textacy",
|
||||
|
|
|
@ -57,15 +57,9 @@ const AlertSpace = ({ nightly, legacy }) => {
|
|||
)
|
||||
}
|
||||
|
||||
// const navAlert = (
|
||||
// <Link to="/usage/v3-5" noLinkLayout>
|
||||
// <strong>💥 Out now:</strong> spaCy v3.5
|
||||
// </Link>
|
||||
// )
|
||||
|
||||
const navAlert = (
|
||||
<Link to="https://form.typeform.com/to/aMel9q9f" noLinkLayout>
|
||||
<strong>💥 Take the user survey!</strong>
|
||||
<Link to="/usage/v3-5" noLinkLayout>
|
||||
<strong>💥 Out now:</strong> spaCy v3.5
|
||||
</Link>
|
||||
)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user