mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Merge pull request #11375 from adrianeboyd/chore/update-develop-from-master-v3.5-1
Update develop from master for v3.5
This commit is contained in:
commit
6fd3b4d9d6
4
.github/azure-steps.yml
vendored
4
.github/azure-steps.yml
vendored
|
@ -54,12 +54,12 @@ steps:
|
||||||
condition: eq(${{ parameters.gpu }}, true)
|
condition: eq(${{ parameters.gpu }}, true)
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
${{ parameters.prefix }} python -m pytest --pyargs spacy
|
${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
|
||||||
displayName: "Run CPU tests"
|
displayName: "Run CPU tests"
|
||||||
condition: eq(${{ parameters.gpu }}, false)
|
condition: eq(${{ parameters.gpu }}, false)
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
|
${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
|
||||||
displayName: "Run GPU tests"
|
displayName: "Run GPU tests"
|
||||||
condition: eq(${{ parameters.gpu }}, true)
|
condition: eq(${{ parameters.gpu }}, true)
|
||||||
|
|
||||||
|
|
13
.github/no-response.yml
vendored
13
.github/no-response.yml
vendored
|
@ -1,13 +0,0 @@
|
||||||
# Configuration for probot-no-response - https://github.com/probot/no-response
|
|
||||||
|
|
||||||
# Number of days of inactivity before an Issue is closed for lack of response
|
|
||||||
daysUntilClose: 14
|
|
||||||
# Label requiring a response
|
|
||||||
responseRequiredLabel: more-info-needed
|
|
||||||
# Comment to post when closing an Issue for lack of response. Set to `false` to disable
|
|
||||||
closeComment: >
|
|
||||||
This issue has been automatically closed because there has been no response
|
|
||||||
to a request for more information from the original author. With only the
|
|
||||||
information that is currently in the issue, there's not enough information
|
|
||||||
to take action. If you're the original author, feel free to reopen the issue
|
|
||||||
if you have or find the answers needed to investigate further.
|
|
8
.github/workflows/issue-manager.yml
vendored
8
.github/workflows/issue-manager.yml
vendored
|
@ -15,7 +15,7 @@ jobs:
|
||||||
issue-manager:
|
issue-manager:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: tiangolo/issue-manager@0.2.1
|
- uses: tiangolo/issue-manager@0.4.0
|
||||||
with:
|
with:
|
||||||
token: ${{ secrets.GITHUB_TOKEN }}
|
token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
config: >
|
config: >
|
||||||
|
@ -25,5 +25,11 @@ jobs:
|
||||||
"message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.",
|
"message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.",
|
||||||
"remove_label_on_comment": true,
|
"remove_label_on_comment": true,
|
||||||
"remove_label_on_close": true
|
"remove_label_on_close": true
|
||||||
|
},
|
||||||
|
"more-info-needed": {
|
||||||
|
"delay": "P7D",
|
||||||
|
"message": "This issue has been automatically closed because there has been no response to a request for more information from the original author. With only the information that is currently in the issue, there's not enough information to take action. If you're the original author, feel free to reopen the issue if you have or find the answers needed to investigate further.",
|
||||||
|
"remove_label_on_comment": true,
|
||||||
|
"remove_label_on_close": true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -32,7 +32,7 @@ jobs:
|
||||||
versionSpec: "3.7"
|
versionSpec: "3.7"
|
||||||
- script: |
|
- script: |
|
||||||
pip install flake8==3.9.2
|
pip install flake8==3.9.2
|
||||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
|
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||||
displayName: "flake8"
|
displayName: "flake8"
|
||||||
|
|
||||||
- job: "Test"
|
- job: "Test"
|
||||||
|
|
|
@ -191,6 +191,8 @@ def load_model(name: str) -> "Language":
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Note that we typically put the `from typing` import statements on the first line(s) of the Python module.
|
||||||
|
|
||||||
## Structuring logic
|
## Structuring logic
|
||||||
|
|
||||||
### Positional and keyword arguments
|
### Positional and keyword arguments
|
||||||
|
@ -275,6 +277,27 @@ If you have to use `try`/`except`, make sure to only include what's **absolutely
|
||||||
+ return [v.strip() for v in value.split(",")]
|
+ return [v.strip() for v in value.split(",")]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Numeric comparisons
|
||||||
|
|
||||||
|
For numeric comparisons, as a general rule we always use `<` and `>=` and avoid the usage of `<=` and `>`. This is to ensure we consistently
|
||||||
|
apply inclusive lower bounds and exclusive upper bounds, helping to prevent off-by-one errors.
|
||||||
|
|
||||||
|
One exception to this rule is a range check with both a lower and an upper bound. With a chain like
|
||||||
|
|
||||||
|
```python
|
||||||
|
if value >= 0 and value < max:
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
it's fine to rewrite this to the shorter form
|
||||||
|
|
||||||
|
```python
|
||||||
|
if 0 <= value < max:
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
even though this requires the usage of the `<=` operator.
|
||||||
|
|
||||||
### Iteration and comprehensions
|
### Iteration and comprehensions
|
||||||
|
|
||||||
We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions.
|
We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions.
|
||||||
|
@ -451,7 +474,7 @@ spaCy uses the [`pytest`](http://doc.pytest.org/) framework for testing. Tests f
|
||||||
|
|
||||||
When adding tests, make sure to use descriptive names and only test for one behavior at a time. Tests should be grouped into modules dedicated to the same type of functionality and some test modules are organized as directories of test files related to the same larger area of the library, e.g. `matcher` or `tokenizer`.
|
When adding tests, make sure to use descriptive names and only test for one behavior at a time. Tests should be grouped into modules dedicated to the same type of functionality and some test modules are organized as directories of test files related to the same larger area of the library, e.g. `matcher` or `tokenizer`.
|
||||||
|
|
||||||
Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g., `test_issue1234.py`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression test suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first.
|
Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g., `test_issue1234.py`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression test suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first.
|
||||||
|
|
||||||
The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file.
|
The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file.
|
||||||
|
|
||||||
|
|
|
@ -123,7 +123,8 @@ def app(environ, start_response):
|
||||||
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||||
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
||||||
|
|
||||||
doc (Doc): Document do parse.
|
orig_doc (Doc): Document to parse.
|
||||||
|
options (Dict[str, Any]): Dependency parse specific visualisation options.
|
||||||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||||
"""
|
"""
|
||||||
doc = Doc(orig_doc.vocab).from_bytes(
|
doc = Doc(orig_doc.vocab).from_bytes(
|
||||||
|
@ -209,7 +210,7 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||||
|
|
||||||
|
|
||||||
def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||||
"""Generate spans in [{start: i, end: i, label: 'label'}] format.
|
"""Generate spans in [{start_token: i, end_token: i, label: 'label'}] format.
|
||||||
|
|
||||||
doc (Doc): Document to parse.
|
doc (Doc): Document to parse.
|
||||||
options (Dict[str, any]): Span-specific visualisation options.
|
options (Dict[str, any]): Span-specific visualisation options.
|
||||||
|
|
|
@ -16,8 +16,8 @@ def setup_default_warnings():
|
||||||
filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa
|
filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa
|
||||||
filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa
|
filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa
|
||||||
|
|
||||||
# warn about entity_ruler & matcher having no patterns only once
|
# warn about entity_ruler, span_ruler & matcher having no patterns only once
|
||||||
for pipe in ["matcher", "entity_ruler"]:
|
for pipe in ["matcher", "entity_ruler", "span_ruler"]:
|
||||||
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
|
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
|
||||||
|
|
||||||
# warn once about lemmatizer without required POS
|
# warn once about lemmatizer without required POS
|
||||||
|
@ -389,7 +389,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"consider using doc.spans instead.")
|
"consider using doc.spans instead.")
|
||||||
E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
|
E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
|
||||||
"settings: {opts}")
|
"settings: {opts}")
|
||||||
E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
|
E107 = ("Value of custom attribute `{attr}` is not JSON-serializable: {value}")
|
||||||
E109 = ("Component '{name}' could not be run. Did you forget to "
|
E109 = ("Component '{name}' could not be run. Did you forget to "
|
||||||
"call `initialize()`?")
|
"call `initialize()`?")
|
||||||
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
||||||
|
@ -535,11 +535,12 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
||||||
"table, which contains {n_rows} vectors.")
|
"table, which contains {n_rows} vectors.")
|
||||||
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
||||||
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
E200 = ("Can't set {attr} from Span.")
|
||||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
|
||||||
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
|
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
|
||||||
|
"not permitted in factory names.")
|
||||||
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
|
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
|
||||||
"permit overlapping spans.")
|
"permit overlapping spans.")
|
||||||
E855 = ("Invalid {obj}: {obj} is not from the same doc.")
|
E855 = ("Invalid {obj}: {obj} is not from the same doc.")
|
||||||
|
|
|
@ -3,7 +3,7 @@ from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
||||||
_infixes = (
|
_infixes = (
|
||||||
["·", "ㆍ", "\(", "\)"]
|
["·", "ㆍ", r"\(", r"\)"]
|
||||||
+ [r"(?<=[0-9])~(?=[0-9-])"]
|
+ [r"(?<=[0-9])~(?=[0-9-])"]
|
||||||
+ LIST_QUOTES
|
+ LIST_QUOTES
|
||||||
+ BASE_TOKENIZER_INFIXES
|
+ BASE_TOKENIZER_INFIXES
|
||||||
|
|
18
spacy/lang/lg/__init__.py
Normal file
18
spacy/lang/lg/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
|
class LugandaDefaults(BaseDefaults):
|
||||||
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
infixes = TOKENIZER_INFIXES
|
||||||
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
class Luganda(Language):
|
||||||
|
lang = "lg"
|
||||||
|
Defaults = LugandaDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Luganda"]
|
17
spacy/lang/lg/examples.py
Normal file
17
spacy/lang/lg/examples.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.lg.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Mpa ebyafaayo ku byalo Nakatu ne Nkajja",
|
||||||
|
"Okuyita Ttembo kitegeeza kugwa ddalu",
|
||||||
|
"Ekifumu kino kyali kya mulimu ki?",
|
||||||
|
"Ekkovu we liyise wayitibwa mukululo",
|
||||||
|
"Akola mulimu ki oguvaamu ssente?",
|
||||||
|
"Emisumaali egikomerera embaawo giyitibwa nninga",
|
||||||
|
"Abooluganda ab’emmamba ababiri",
|
||||||
|
"Ekisaawe ky'ebyenjigiriza kya mugaso nnyo",
|
||||||
|
]
|
95
spacy/lang/lg/lex_attrs.py
Normal file
95
spacy/lang/lg/lex_attrs.py
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
_num_words = [
|
||||||
|
"nnooti", # Zero
|
||||||
|
"zeero", # zero
|
||||||
|
"emu", # one
|
||||||
|
"bbiri", # two
|
||||||
|
"ssatu", # three
|
||||||
|
"nnya", # four
|
||||||
|
"ttaano", # five
|
||||||
|
"mukaaga", # six
|
||||||
|
"musanvu", # seven
|
||||||
|
"munaana", # eight
|
||||||
|
"mwenda", # nine
|
||||||
|
"kkumi", # ten
|
||||||
|
"kkumi n'emu", # eleven
|
||||||
|
"kkumi na bbiri", # twelve
|
||||||
|
"kkumi na ssatu", # thirteen
|
||||||
|
"kkumi na nnya", # fourteen
|
||||||
|
"kkumi na ttaano", # fifteen
|
||||||
|
"kkumi na mukaaga", # sixteen
|
||||||
|
"kkumi na musanvu", # seventeen
|
||||||
|
"kkumi na munaana", # eighteen
|
||||||
|
"kkumi na mwenda", # nineteen
|
||||||
|
"amakumi abiri", # twenty
|
||||||
|
"amakumi asatu", # thirty
|
||||||
|
"amakumi ana", # forty
|
||||||
|
"amakumi ataano", # fifty
|
||||||
|
"nkaaga", # sixty
|
||||||
|
"nsanvu", # seventy
|
||||||
|
"kinaana", # eighty
|
||||||
|
"kyenda", # ninety
|
||||||
|
"kikumi", # hundred
|
||||||
|
"lukumi", # thousand
|
||||||
|
"kakadde", # million
|
||||||
|
"kawumbi", # billion
|
||||||
|
"kase", # trillion
|
||||||
|
"katabalika", # quadrillion
|
||||||
|
"keesedde", # gajillion
|
||||||
|
"kafukunya", # bazillion
|
||||||
|
"ekisooka", # first
|
||||||
|
"ekyokubiri", # second
|
||||||
|
"ekyokusatu", # third
|
||||||
|
"ekyokuna", # fourth
|
||||||
|
"ekyokutaano", # fifth
|
||||||
|
"ekyomukaaga", # sixth
|
||||||
|
"ekyomusanvu", # seventh
|
||||||
|
"eky'omunaana", # eighth
|
||||||
|
"ekyomwenda", # ninth
|
||||||
|
"ekyekkumi", # tenth
|
||||||
|
"ekyekkumi n'ekimu", # eleventh
|
||||||
|
"ekyekkumi n'ebibiri", # twelfth
|
||||||
|
"ekyekkumi n'ebisatu", # thirteenth
|
||||||
|
"ekyekkumi n'ebina", # fourteenth
|
||||||
|
"ekyekkumi n'ebitaano", # fifteenth
|
||||||
|
"ekyekkumi n'omukaaga", # sixteenth
|
||||||
|
"ekyekkumi n'omusanvu", # seventeenth
|
||||||
|
"ekyekkumi n'omunaana", # eighteenth
|
||||||
|
"ekyekkumi n'omwenda", # nineteenth
|
||||||
|
"ekyamakumi abiri", # twentieth
|
||||||
|
"ekyamakumi asatu", # thirtieth
|
||||||
|
"ekyamakumi ana", # fortieth
|
||||||
|
"ekyamakumi ataano", # fiftieth
|
||||||
|
"ekyenkaaga", # sixtieth
|
||||||
|
"ekyensanvu", # seventieth
|
||||||
|
"ekyekinaana", # eightieth
|
||||||
|
"ekyekyenda", # ninetieth
|
||||||
|
"ekyekikumi", # hundredth
|
||||||
|
"ekyolukumi", # thousandth
|
||||||
|
"ekyakakadde", # millionth
|
||||||
|
"ekyakawumbi", # billionth
|
||||||
|
"ekyakase", # trillionth
|
||||||
|
"ekyakatabalika", # quadrillionth
|
||||||
|
"ekyakeesedde", # gajillionth
|
||||||
|
"ekyakafukunya", # bazillionth
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
|
||||||
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
|
text = text[1:]
|
||||||
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
if text.isdigit():
|
||||||
|
return True
|
||||||
|
if text.count("/") == 1:
|
||||||
|
num, denom = text.split("/")
|
||||||
|
if num.isdigit() and denom.isdigit():
|
||||||
|
return True
|
||||||
|
text_lower = text.lower()
|
||||||
|
if text_lower in _num_words:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
19
spacy/lang/lg/punctuation.py
Normal file
19
spacy/lang/lg/punctuation.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
|
||||||
|
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||||
|
|
||||||
|
_infixes = (
|
||||||
|
LIST_ELLIPSES
|
||||||
|
+ LIST_ICONS
|
||||||
|
+ [
|
||||||
|
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||||
|
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||||
|
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||||
|
),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||||
|
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_INFIXES = _infixes
|
19
spacy/lang/lg/stop_words.py
Normal file
19
spacy/lang/lg/stop_words.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
|
abadde abalala abamu abangi abava ajja ali alina ani anti ateekeddwa atewamu
|
||||||
|
atya awamu aweebwa ayinza ba baali babadde babalina bajja
|
||||||
|
bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye
|
||||||
|
bimu bingi bino bo bokka bonna buli bulijjo bulungi bwabwe bwaffe bwayo bwe bwonna bya byabwe
|
||||||
|
byaffe byebimu byonna ddaa ddala ddi e ebimu ebiri ebweruobulungi ebyo edda ejja ekirala ekyo
|
||||||
|
endala engeri ennyo era erimu erina ffe ffenna ga gujja gumu gunno guno gwa gwe kaseera kati
|
||||||
|
kennyini ki kiki kikino kikye kikyo kino kirungi kki ku kubangabyombi kubangaolwokuba kudda
|
||||||
|
kuva kuwa kwegamba kyaffe kye kyekimuoyo kyekyo kyonna leero liryo lwa lwaki lyabwezaabwe
|
||||||
|
lyaffe lyange mbadde mingi mpozzi mu mulinaoyina munda mwegyabwe nolwekyo nabadde nabo nandiyagadde
|
||||||
|
nandiye nanti naye ne nedda neera nga nnyingi nnyini nnyinza nnyo nti nyinza nze oba ojja okudda
|
||||||
|
okugenda okuggyako okutuusa okuva okuwa oli olina oluvannyuma olwekyobuva omuli ono osobola otya
|
||||||
|
oyina oyo seetaaga si sinakindi singa talina tayina tebaali tebaalina tebayina terina tetulina
|
||||||
|
tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula
|
||||||
|
wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe
|
||||||
|
ye yenna yennyini yina yonna ziba zijja zonna
|
||||||
|
""".split()
|
||||||
|
)
|
|
@ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
span_label = doc.vocab.strings.add("NP")
|
span_label = doc.vocab.strings.add("NP")
|
||||||
|
|
||||||
# Only NOUNS and PRONOUNS matter
|
# Only NOUNS and PRONOUNS matter
|
||||||
|
end_span = -1
|
||||||
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
|
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
|
||||||
# For NOUNS
|
# For NOUNS
|
||||||
# Pick children from syntactic parse (only those with certain dependencies)
|
# Pick children from syntactic parse (only those with certain dependencies)
|
||||||
|
@ -58,15 +59,17 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
children_i = [c.i for c in children] + [word.i]
|
children_i = [c.i for c in children] + [word.i]
|
||||||
|
|
||||||
start_span = min(children_i)
|
start_span = min(children_i)
|
||||||
end_span = max(children_i) + 1
|
if start_span >= end_span:
|
||||||
yield start_span, end_span, span_label
|
end_span = max(children_i) + 1
|
||||||
|
yield start_span, end_span, span_label
|
||||||
|
|
||||||
# PRONOUNS only if it is the subject of a verb
|
# PRONOUNS only if it is the subject of a verb
|
||||||
elif word.pos == PRON:
|
elif word.pos == PRON:
|
||||||
if word.dep in pronoun_deps:
|
if word.dep in pronoun_deps:
|
||||||
start_span = word.i
|
start_span = word.i
|
||||||
end_span = word.i + 1
|
if start_span >= end_span:
|
||||||
yield start_span, end_span, span_label
|
end_span = word.i + 1
|
||||||
|
yield start_span, end_span, span_label
|
||||||
|
|
||||||
|
|
||||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
||||||
|
|
|
@ -465,6 +465,8 @@ class Language:
|
||||||
"""
|
"""
|
||||||
if not isinstance(name, str):
|
if not isinstance(name, str):
|
||||||
raise ValueError(Errors.E963.format(decorator="factory"))
|
raise ValueError(Errors.E963.format(decorator="factory"))
|
||||||
|
if "." in name:
|
||||||
|
raise ValueError(Errors.E853.format(name=name))
|
||||||
if not isinstance(default_config, dict):
|
if not isinstance(default_config, dict):
|
||||||
err = Errors.E962.format(
|
err = Errors.E962.format(
|
||||||
style="default config", name=name, cfg_type=type(default_config)
|
style="default config", name=name, cfg_type=type(default_config)
|
||||||
|
@ -543,8 +545,11 @@ class Language:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#component
|
DOCS: https://spacy.io/api/language#component
|
||||||
"""
|
"""
|
||||||
if name is not None and not isinstance(name, str):
|
if name is not None:
|
||||||
raise ValueError(Errors.E963.format(decorator="component"))
|
if not isinstance(name, str):
|
||||||
|
raise ValueError(Errors.E963.format(decorator="component"))
|
||||||
|
if "." in name:
|
||||||
|
raise ValueError(Errors.E853.format(name=name))
|
||||||
component_name = name if name is not None else util.get_object_name(func)
|
component_name = name if name is not None else util.get_object_name(func)
|
||||||
|
|
||||||
def add_component(component_func: "Pipe") -> Callable:
|
def add_component(component_func: "Pipe") -> Callable:
|
||||||
|
|
|
@ -207,7 +207,7 @@ class TokenPatternOperatorSimple(str, Enum):
|
||||||
|
|
||||||
|
|
||||||
class TokenPatternOperatorMinMax(ConstrainedStr):
|
class TokenPatternOperatorMinMax(ConstrainedStr):
|
||||||
regex = re.compile("^({\d+}|{\d+,\d*}|{\d*,\d+})$")
|
regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$")
|
||||||
|
|
||||||
|
|
||||||
TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax]
|
TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax]
|
||||||
|
@ -514,6 +514,14 @@ class DocJSONSchema(BaseModel):
|
||||||
tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
|
tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
|
||||||
..., title="Token information - ID, start, annotations"
|
..., title="Token information - ID, start, annotations"
|
||||||
)
|
)
|
||||||
_: Optional[Dict[StrictStr, Any]] = Field(
|
underscore_doc: Optional[Dict[StrictStr, Any]] = Field(
|
||||||
None, title="Any custom data stored in the document's _ attribute"
|
None,
|
||||||
|
title="Any custom data stored in the document's _ attribute",
|
||||||
|
alias="_",
|
||||||
|
)
|
||||||
|
underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
|
||||||
|
None, title="Any custom data stored in the token's _ attribute"
|
||||||
|
)
|
||||||
|
underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
|
||||||
|
None, title="Any custom data stored in the span's _ attribute"
|
||||||
)
|
)
|
||||||
|
|
|
@ -261,6 +261,11 @@ def lb_tokenizer():
|
||||||
return get_lang_class("lb")().tokenizer
|
return get_lang_class("lb")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def lg_tokenizer():
|
||||||
|
return get_lang_class("lg")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def lt_tokenizer():
|
def lt_tokenizer():
|
||||||
return get_lang_class("lt")().tokenizer
|
return get_lang_class("lt")().tokenizer
|
||||||
|
|
|
@ -3,6 +3,7 @@ import weakref
|
||||||
import numpy
|
import numpy
|
||||||
from numpy.testing import assert_array_equal
|
from numpy.testing import assert_array_equal
|
||||||
import pytest
|
import pytest
|
||||||
|
import warnings
|
||||||
from thinc.api import NumpyOps, get_current_ops
|
from thinc.api import NumpyOps, get_current_ops
|
||||||
|
|
||||||
from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
|
from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
|
||||||
|
@ -529,9 +530,9 @@ def test_doc_from_array_sent_starts(en_vocab):
|
||||||
# no warning using default attrs
|
# no warning using default attrs
|
||||||
attrs = doc._get_array_attrs()
|
attrs = doc._get_array_attrs()
|
||||||
arr = doc.to_array(attrs)
|
arr = doc.to_array(attrs)
|
||||||
with pytest.warns(None) as record:
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error")
|
||||||
new_doc.from_array(attrs, arr)
|
new_doc.from_array(attrs, arr)
|
||||||
assert len(record) == 0
|
|
||||||
# only SENT_START uses SENT_START
|
# only SENT_START uses SENT_START
|
||||||
attrs = [SENT_START]
|
attrs = [SENT_START]
|
||||||
arr = doc.to_array(attrs)
|
arr = doc.to_array(attrs)
|
||||||
|
|
|
@ -1,12 +1,15 @@
|
||||||
import pytest
|
import pytest
|
||||||
import spacy
|
import spacy
|
||||||
from spacy import schemas
|
from spacy import schemas
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span, Token
|
||||||
|
import srsly
|
||||||
|
from .test_underscore import clean_underscore # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
def doc(en_vocab):
|
def doc(en_vocab):
|
||||||
words = ["c", "d", "e"]
|
words = ["c", "d", "e"]
|
||||||
|
spaces = [True, True, True]
|
||||||
pos = ["VERB", "NOUN", "NOUN"]
|
pos = ["VERB", "NOUN", "NOUN"]
|
||||||
tags = ["VBP", "NN", "NN"]
|
tags = ["VBP", "NN", "NN"]
|
||||||
heads = [0, 0, 1]
|
heads = [0, 0, 1]
|
||||||
|
@ -17,6 +20,7 @@ def doc(en_vocab):
|
||||||
return Doc(
|
return Doc(
|
||||||
en_vocab,
|
en_vocab,
|
||||||
words=words,
|
words=words,
|
||||||
|
spaces=spaces,
|
||||||
pos=pos,
|
pos=pos,
|
||||||
tags=tags,
|
tags=tags,
|
||||||
heads=heads,
|
heads=heads,
|
||||||
|
@ -45,6 +49,47 @@ def doc_without_deps(en_vocab):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def doc_json():
|
||||||
|
return {
|
||||||
|
"text": "c d e ",
|
||||||
|
"ents": [{"start": 2, "end": 3, "label": "ORG"}],
|
||||||
|
"sents": [{"start": 0, "end": 5}],
|
||||||
|
"tokens": [
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"start": 0,
|
||||||
|
"end": 1,
|
||||||
|
"tag": "VBP",
|
||||||
|
"pos": "VERB",
|
||||||
|
"morph": "Feat1=A",
|
||||||
|
"dep": "ROOT",
|
||||||
|
"head": 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"start": 2,
|
||||||
|
"end": 3,
|
||||||
|
"tag": "NN",
|
||||||
|
"pos": "NOUN",
|
||||||
|
"morph": "Feat1=B",
|
||||||
|
"dep": "dobj",
|
||||||
|
"head": 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"start": 4,
|
||||||
|
"end": 5,
|
||||||
|
"tag": "NN",
|
||||||
|
"pos": "NOUN",
|
||||||
|
"morph": "Feat1=A|Feat2=D",
|
||||||
|
"dep": "dobj",
|
||||||
|
"head": 1,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def test_doc_to_json(doc):
|
def test_doc_to_json(doc):
|
||||||
json_doc = doc.to_json()
|
json_doc = doc.to_json()
|
||||||
assert json_doc["text"] == "c d e "
|
assert json_doc["text"] == "c d e "
|
||||||
|
@ -56,7 +101,8 @@ def test_doc_to_json(doc):
|
||||||
assert json_doc["ents"][0]["start"] == 2 # character offset!
|
assert json_doc["ents"][0]["start"] == 2 # character offset!
|
||||||
assert json_doc["ents"][0]["end"] == 3 # character offset!
|
assert json_doc["ents"][0]["end"] == 3 # character offset!
|
||||||
assert json_doc["ents"][0]["label"] == "ORG"
|
assert json_doc["ents"][0]["label"] == "ORG"
|
||||||
assert not schemas.validate(schemas.DocJSONSchema, json_doc)
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||||
|
|
||||||
|
|
||||||
def test_doc_to_json_underscore(doc):
|
def test_doc_to_json_underscore(doc):
|
||||||
|
@ -64,11 +110,96 @@ def test_doc_to_json_underscore(doc):
|
||||||
Doc.set_extension("json_test2", default=False)
|
Doc.set_extension("json_test2", default=False)
|
||||||
doc._.json_test1 = "hello world"
|
doc._.json_test1 = "hello world"
|
||||||
doc._.json_test2 = [1, 2, 3]
|
doc._.json_test2 = [1, 2, 3]
|
||||||
|
|
||||||
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
|
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
|
||||||
assert "_" in json_doc
|
assert "_" in json_doc
|
||||||
assert json_doc["_"]["json_test1"] == "hello world"
|
assert json_doc["_"]["json_test1"] == "hello world"
|
||||||
assert json_doc["_"]["json_test2"] == [1, 2, 3]
|
assert json_doc["_"]["json_test2"] == [1, 2, 3]
|
||||||
assert not schemas.validate(schemas.DocJSONSchema, json_doc)
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_to_json_with_token_span_attributes(doc):
|
||||||
|
Doc.set_extension("json_test1", default=False)
|
||||||
|
Doc.set_extension("json_test2", default=False)
|
||||||
|
Token.set_extension("token_test", default=False)
|
||||||
|
Span.set_extension("span_test", default=False)
|
||||||
|
|
||||||
|
doc._.json_test1 = "hello world"
|
||||||
|
doc._.json_test2 = [1, 2, 3]
|
||||||
|
doc[0:1]._.span_test = "span_attribute"
|
||||||
|
doc[0]._.token_test = 117
|
||||||
|
doc.spans["span_group"] = [doc[0:1]]
|
||||||
|
json_doc = doc.to_json(
|
||||||
|
underscore=["json_test1", "json_test2", "token_test", "span_test"]
|
||||||
|
)
|
||||||
|
|
||||||
|
assert "_" in json_doc
|
||||||
|
assert json_doc["_"]["json_test1"] == "hello world"
|
||||||
|
assert json_doc["_"]["json_test2"] == [1, 2, 3]
|
||||||
|
assert "underscore_token" in json_doc
|
||||||
|
assert "underscore_span" in json_doc
|
||||||
|
assert json_doc["underscore_token"]["token_test"]["value"] == 117
|
||||||
|
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
|
||||||
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_to_json_with_custom_user_data(doc):
|
||||||
|
Doc.set_extension("json_test", default=False)
|
||||||
|
Token.set_extension("token_test", default=False)
|
||||||
|
Span.set_extension("span_test", default=False)
|
||||||
|
|
||||||
|
doc._.json_test = "hello world"
|
||||||
|
doc[0:1]._.span_test = "span_attribute"
|
||||||
|
doc[0]._.token_test = 117
|
||||||
|
json_doc = doc.to_json(underscore=["json_test", "token_test", "span_test"])
|
||||||
|
doc.user_data["user_data_test"] = 10
|
||||||
|
doc.user_data[("user_data_test2", True)] = 10
|
||||||
|
|
||||||
|
assert "_" in json_doc
|
||||||
|
assert json_doc["_"]["json_test"] == "hello world"
|
||||||
|
assert "underscore_token" in json_doc
|
||||||
|
assert "underscore_span" in json_doc
|
||||||
|
assert json_doc["underscore_token"]["token_test"]["value"] == 117
|
||||||
|
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
|
||||||
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_to_json_with_token_span_same_identifier(doc):
|
||||||
|
Doc.set_extension("my_ext", default=False)
|
||||||
|
Token.set_extension("my_ext", default=False)
|
||||||
|
Span.set_extension("my_ext", default=False)
|
||||||
|
|
||||||
|
doc._.my_ext = "hello world"
|
||||||
|
doc[0:1]._.my_ext = "span_attribute"
|
||||||
|
doc[0]._.my_ext = 117
|
||||||
|
json_doc = doc.to_json(underscore=["my_ext"])
|
||||||
|
|
||||||
|
assert "_" in json_doc
|
||||||
|
assert json_doc["_"]["my_ext"] == "hello world"
|
||||||
|
assert "underscore_token" in json_doc
|
||||||
|
assert "underscore_span" in json_doc
|
||||||
|
assert json_doc["underscore_token"]["my_ext"]["value"] == 117
|
||||||
|
assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
|
||||||
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_to_json_with_token_attributes_missing(doc):
|
||||||
|
Token.set_extension("token_test", default=False)
|
||||||
|
Span.set_extension("span_test", default=False)
|
||||||
|
|
||||||
|
doc[0:1]._.span_test = "span_attribute"
|
||||||
|
doc[0]._.token_test = 117
|
||||||
|
json_doc = doc.to_json(underscore=["span_test"])
|
||||||
|
|
||||||
|
assert "underscore_token" in json_doc
|
||||||
|
assert "underscore_span" in json_doc
|
||||||
|
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
|
||||||
|
assert "token_test" not in json_doc["underscore_token"]
|
||||||
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_doc_to_json_underscore_error_attr(doc):
|
def test_doc_to_json_underscore_error_attr(doc):
|
||||||
|
@ -94,11 +225,29 @@ def test_doc_to_json_span(doc):
|
||||||
assert len(json_doc["spans"]) == 1
|
assert len(json_doc["spans"]) == 1
|
||||||
assert len(json_doc["spans"]["test"]) == 2
|
assert len(json_doc["spans"]["test"]) == 2
|
||||||
assert json_doc["spans"]["test"][0]["start"] == 0
|
assert json_doc["spans"]["test"][0]["start"] == 0
|
||||||
assert not schemas.validate(schemas.DocJSONSchema, json_doc)
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_json_to_doc(doc):
|
def test_json_to_doc(doc):
|
||||||
new_doc = Doc(doc.vocab).from_json(doc.to_json(), validate=True)
|
json_doc = doc.to_json()
|
||||||
|
json_doc = srsly.json_loads(srsly.json_dumps(json_doc))
|
||||||
|
new_doc = Doc(doc.vocab).from_json(json_doc, validate=True)
|
||||||
|
assert new_doc.text == doc.text == "c d e "
|
||||||
|
assert len(new_doc) == len(doc) == 3
|
||||||
|
assert new_doc[0].pos == doc[0].pos
|
||||||
|
assert new_doc[0].tag == doc[0].tag
|
||||||
|
assert new_doc[0].dep == doc[0].dep
|
||||||
|
assert new_doc[0].head.idx == doc[0].head.idx
|
||||||
|
assert new_doc[0].lemma == doc[0].lemma
|
||||||
|
assert len(new_doc.ents) == 1
|
||||||
|
assert new_doc.ents[0].start == 1
|
||||||
|
assert new_doc.ents[0].end == 2
|
||||||
|
assert new_doc.ents[0].label_ == "ORG"
|
||||||
|
assert doc.to_bytes() == new_doc.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
def test_json_to_doc_compat(doc, doc_json):
|
||||||
|
new_doc = Doc(doc.vocab).from_json(doc_json, validate=True)
|
||||||
new_tokens = [token for token in new_doc]
|
new_tokens = [token for token in new_doc]
|
||||||
assert new_doc.text == doc.text == "c d e "
|
assert new_doc.text == doc.text == "c d e "
|
||||||
assert len(new_tokens) == len([token for token in doc]) == 3
|
assert len(new_tokens) == len([token for token in doc]) == 3
|
||||||
|
@ -114,11 +263,8 @@ def test_json_to_doc(doc):
|
||||||
|
|
||||||
|
|
||||||
def test_json_to_doc_underscore(doc):
|
def test_json_to_doc_underscore(doc):
|
||||||
if not Doc.has_extension("json_test1"):
|
Doc.set_extension("json_test1", default=False)
|
||||||
Doc.set_extension("json_test1", default=False)
|
Doc.set_extension("json_test2", default=False)
|
||||||
if not Doc.has_extension("json_test2"):
|
|
||||||
Doc.set_extension("json_test2", default=False)
|
|
||||||
|
|
||||||
doc._.json_test1 = "hello world"
|
doc._.json_test1 = "hello world"
|
||||||
doc._.json_test2 = [1, 2, 3]
|
doc._.json_test2 = [1, 2, 3]
|
||||||
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
|
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
|
||||||
|
@ -126,6 +272,34 @@ def test_json_to_doc_underscore(doc):
|
||||||
assert all([new_doc.has_extension(f"json_test{i}") for i in range(1, 3)])
|
assert all([new_doc.has_extension(f"json_test{i}") for i in range(1, 3)])
|
||||||
assert new_doc._.json_test1 == "hello world"
|
assert new_doc._.json_test1 == "hello world"
|
||||||
assert new_doc._.json_test2 == [1, 2, 3]
|
assert new_doc._.json_test2 == [1, 2, 3]
|
||||||
|
assert doc.to_bytes() == new_doc.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
def test_json_to_doc_with_token_span_attributes(doc):
|
||||||
|
Doc.set_extension("json_test1", default=False)
|
||||||
|
Doc.set_extension("json_test2", default=False)
|
||||||
|
Token.set_extension("token_test", default=False)
|
||||||
|
Span.set_extension("span_test", default=False)
|
||||||
|
doc._.json_test1 = "hello world"
|
||||||
|
doc._.json_test2 = [1, 2, 3]
|
||||||
|
doc[0:1]._.span_test = "span_attribute"
|
||||||
|
doc[0]._.token_test = 117
|
||||||
|
|
||||||
|
json_doc = doc.to_json(
|
||||||
|
underscore=["json_test1", "json_test2", "token_test", "span_test"]
|
||||||
|
)
|
||||||
|
json_doc = srsly.json_loads(srsly.json_dumps(json_doc))
|
||||||
|
new_doc = Doc(doc.vocab).from_json(json_doc, validate=True)
|
||||||
|
|
||||||
|
assert all([new_doc.has_extension(f"json_test{i}") for i in range(1, 3)])
|
||||||
|
assert new_doc._.json_test1 == "hello world"
|
||||||
|
assert new_doc._.json_test2 == [1, 2, 3]
|
||||||
|
assert new_doc[0]._.token_test == 117
|
||||||
|
assert new_doc[0:1]._.span_test == "span_attribute"
|
||||||
|
assert new_doc.user_data == doc.user_data
|
||||||
|
assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
|
||||||
|
exclude=["user_data"]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_json_to_doc_spans(doc):
|
def test_json_to_doc_spans(doc):
|
||||||
|
|
0
spacy/tests/lang/lg/__init__.py
Normal file
0
spacy/tests/lang/lg/__init__.py
Normal file
15
spacy/tests/lang/lg/test_tokenizer.py
Normal file
15
spacy/tests/lang/lg/test_tokenizer.py
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
LG_BASIC_TOKENIZATION_TESTS = [
|
||||||
|
(
|
||||||
|
"Abooluganda ab’emmamba ababiri",
|
||||||
|
["Abooluganda", "ab’emmamba", "ababiri"],
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text,expected_tokens", LG_BASIC_TOKENIZATION_TESTS)
|
||||||
|
def test_lg_tokenizer_basic(lg_tokenizer, text, expected_tokens):
|
||||||
|
tokens = lg_tokenizer(text)
|
||||||
|
token_list = [token.text for token in tokens if not token.is_space]
|
||||||
|
assert expected_tokens == token_list
|
|
@ -1,5 +1,6 @@
|
||||||
from spacy.tokens import Doc
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
from spacy.util import filter_spans
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -207,3 +208,18 @@ def test_chunking(nl_sample, nl_reference_chunking):
|
||||||
"""
|
"""
|
||||||
chunks = [s.text.lower() for s in nl_sample.noun_chunks]
|
chunks = [s.text.lower() for s in nl_sample.noun_chunks]
|
||||||
assert chunks == nl_reference_chunking
|
assert chunks == nl_reference_chunking
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(10846)
|
||||||
|
def test_no_overlapping_chunks(nl_vocab):
|
||||||
|
# fmt: off
|
||||||
|
doc = Doc(
|
||||||
|
nl_vocab,
|
||||||
|
words=["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"],
|
||||||
|
deps=["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"],
|
||||||
|
heads=[1, 3, 3, 3, 8, 8, 5, 8, 3],
|
||||||
|
pos=["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"],
|
||||||
|
)
|
||||||
|
# fmt: on
|
||||||
|
chunks = list(doc.noun_chunks)
|
||||||
|
assert filter_spans(chunks) == chunks
|
||||||
|
|
|
@ -2,6 +2,9 @@ import pytest
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
|
||||||
|
|
||||||
|
|
||||||
def test_ru_doc_lemmatization(ru_lemmatizer):
|
def test_ru_doc_lemmatization(ru_lemmatizer):
|
||||||
words = ["мама", "мыла", "раму"]
|
words = ["мама", "мыла", "раму"]
|
||||||
pos = ["NOUN", "VERB", "NOUN"]
|
pos = ["NOUN", "VERB", "NOUN"]
|
||||||
|
|
|
@ -1,6 +1,10 @@
|
||||||
|
import pytest
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
|
||||||
|
|
||||||
|
|
||||||
def test_uk_lemmatizer(uk_lemmatizer):
|
def test_uk_lemmatizer(uk_lemmatizer):
|
||||||
"""Check that the default uk lemmatizer runs."""
|
"""Check that the default uk lemmatizer runs."""
|
||||||
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
|
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
import warnings
|
||||||
import srsly
|
import srsly
|
||||||
from mock import Mock
|
from mock import Mock
|
||||||
|
|
||||||
|
@ -344,13 +345,13 @@ def test_phrase_matcher_validation(en_vocab):
|
||||||
matcher.add("TEST1", [doc1])
|
matcher.add("TEST1", [doc1])
|
||||||
with pytest.warns(UserWarning):
|
with pytest.warns(UserWarning):
|
||||||
matcher.add("TEST2", [doc2])
|
matcher.add("TEST2", [doc2])
|
||||||
with pytest.warns(None) as record:
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error")
|
||||||
matcher.add("TEST3", [doc3])
|
matcher.add("TEST3", [doc3])
|
||||||
assert not record.list
|
|
||||||
matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
|
matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
|
||||||
with pytest.warns(None) as record:
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error")
|
||||||
matcher.add("TEST4", [doc2])
|
matcher.add("TEST4", [doc2])
|
||||||
assert not record.list
|
|
||||||
|
|
||||||
|
|
||||||
def test_attr_validation(en_vocab):
|
def test_attr_validation(en_vocab):
|
||||||
|
|
|
@ -1048,6 +1048,10 @@ def test_no_gold_ents(patterns):
|
||||||
for eg in train_examples:
|
for eg in train_examples:
|
||||||
eg.predicted = ruler(eg.predicted)
|
eg.predicted = ruler(eg.predicted)
|
||||||
|
|
||||||
|
# Entity ruler is no longer needed (initialization below wipes out the
|
||||||
|
# patterns and causes warnings)
|
||||||
|
nlp.remove_pipe("entity_ruler")
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
# create artificial KB
|
# create artificial KB
|
||||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
||||||
|
|
|
@ -659,3 +659,14 @@ def test_multiprocessing_gpu_warning(nlp2, texts):
|
||||||
# Trigger multi-processing.
|
# Trigger multi-processing.
|
||||||
for _ in docs:
|
for _ in docs:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_dot_in_factory_names(nlp):
|
||||||
|
Language.component("my_evil_component", func=evil_component)
|
||||||
|
nlp.add_pipe("my_evil_component")
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="not permitted"):
|
||||||
|
Language.component("my.evil.component.v1", func=evil_component)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="not permitted"):
|
||||||
|
Language.factory("my.evil.component.v1", func=evil_component)
|
||||||
|
|
|
@ -431,3 +431,41 @@ def test_Example_aligned_whitespace(en_vocab):
|
||||||
|
|
||||||
example = Example(predicted, reference)
|
example = Example(predicted, reference)
|
||||||
assert example.get_aligned("TAG", as_string=True) == tags
|
assert example.get_aligned("TAG", as_string=True) == tags
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue("11260")
|
||||||
|
def test_issue11260():
|
||||||
|
annots = {
|
||||||
|
"words": ["I", "like", "New", "York", "."],
|
||||||
|
"spans": {
|
||||||
|
"cities": [(7, 15, "LOC", "")],
|
||||||
|
"people": [(0, 1, "PERSON", "")],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
vocab = Vocab()
|
||||||
|
predicted = Doc(vocab, words=annots["words"])
|
||||||
|
example = Example.from_dict(predicted, annots)
|
||||||
|
assert len(example.reference.spans["cities"]) == 1
|
||||||
|
assert len(example.reference.spans["people"]) == 1
|
||||||
|
|
||||||
|
output_dict = example.to_dict()
|
||||||
|
assert "spans" in output_dict["doc_annotation"]
|
||||||
|
assert output_dict["doc_annotation"]["spans"]["cities"] == annots["spans"]["cities"]
|
||||||
|
assert output_dict["doc_annotation"]["spans"]["people"] == annots["spans"]["people"]
|
||||||
|
|
||||||
|
output_example = Example.from_dict(predicted, output_dict)
|
||||||
|
|
||||||
|
assert len(output_example.reference.spans["cities"]) == len(
|
||||||
|
example.reference.spans["cities"]
|
||||||
|
)
|
||||||
|
assert len(output_example.reference.spans["people"]) == len(
|
||||||
|
example.reference.spans["people"]
|
||||||
|
)
|
||||||
|
for span in example.reference.spans["cities"]:
|
||||||
|
assert span.label_ == "LOC"
|
||||||
|
assert span.text == "New York"
|
||||||
|
assert span.start_char == 7
|
||||||
|
for span in example.reference.spans["people"]:
|
||||||
|
assert span.label_ == "PERSON"
|
||||||
|
assert span.text == "I"
|
||||||
|
assert span.start_char == 0
|
||||||
|
|
|
@ -1602,13 +1602,30 @@ cdef class Doc:
|
||||||
ents.append(char_span)
|
ents.append(char_span)
|
||||||
self.ents = ents
|
self.ents = ents
|
||||||
|
|
||||||
# Add custom attributes. Note that only Doc extensions are currently considered, Token and Span extensions are
|
# Add custom attributes for the whole Doc object.
|
||||||
# not yet supported.
|
|
||||||
for attr in doc_json.get("_", {}):
|
for attr in doc_json.get("_", {}):
|
||||||
if not Doc.has_extension(attr):
|
if not Doc.has_extension(attr):
|
||||||
Doc.set_extension(attr)
|
Doc.set_extension(attr)
|
||||||
self._.set(attr, doc_json["_"][attr])
|
self._.set(attr, doc_json["_"][attr])
|
||||||
|
|
||||||
|
if doc_json.get("underscore_token", {}):
|
||||||
|
for token_attr in doc_json["underscore_token"]:
|
||||||
|
token_start = doc_json["underscore_token"][token_attr]["token_start"]
|
||||||
|
value = doc_json["underscore_token"][token_attr]["value"]
|
||||||
|
|
||||||
|
if not Token.has_extension(token_attr):
|
||||||
|
Token.set_extension(token_attr)
|
||||||
|
self[token_start]._.set(token_attr, value)
|
||||||
|
|
||||||
|
if doc_json.get("underscore_span", {}):
|
||||||
|
for span_attr in doc_json["underscore_span"]:
|
||||||
|
token_start = doc_json["underscore_span"][span_attr]["token_start"]
|
||||||
|
token_end = doc_json["underscore_span"][span_attr]["token_end"]
|
||||||
|
value = doc_json["underscore_span"][span_attr]["value"]
|
||||||
|
|
||||||
|
if not Span.has_extension(span_attr):
|
||||||
|
Span.set_extension(span_attr)
|
||||||
|
self[token_start:token_end]._.set(span_attr, value)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_json(self, underscore=None):
|
def to_json(self, underscore=None):
|
||||||
|
@ -1650,20 +1667,40 @@ cdef class Doc:
|
||||||
for span_group in self.spans:
|
for span_group in self.spans:
|
||||||
data["spans"][span_group] = []
|
data["spans"][span_group] = []
|
||||||
for span in self.spans[span_group]:
|
for span in self.spans[span_group]:
|
||||||
span_data = {
|
span_data = {"start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_}
|
||||||
"start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_
|
|
||||||
}
|
|
||||||
data["spans"][span_group].append(span_data)
|
data["spans"][span_group].append(span_data)
|
||||||
|
|
||||||
if underscore:
|
if underscore:
|
||||||
data["_"] = {}
|
user_keys = set()
|
||||||
|
if self.user_data:
|
||||||
|
data["_"] = {}
|
||||||
|
data["underscore_token"] = {}
|
||||||
|
data["underscore_span"] = {}
|
||||||
|
for data_key in self.user_data:
|
||||||
|
if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
|
||||||
|
attr = data_key[1]
|
||||||
|
start = data_key[2]
|
||||||
|
end = data_key[3]
|
||||||
|
if attr in underscore:
|
||||||
|
user_keys.add(attr)
|
||||||
|
value = self.user_data[data_key]
|
||||||
|
if not srsly.is_json_serializable(value):
|
||||||
|
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
|
||||||
|
# Check if doc attribute
|
||||||
|
if start is None:
|
||||||
|
data["_"][attr] = value
|
||||||
|
# Check if token attribute
|
||||||
|
elif end is None:
|
||||||
|
if attr not in data["underscore_token"]:
|
||||||
|
data["underscore_token"][attr] = {"token_start": start, "value": value}
|
||||||
|
# Else span attribute
|
||||||
|
else:
|
||||||
|
if attr not in data["underscore_span"]:
|
||||||
|
data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
|
||||||
|
|
||||||
for attr in underscore:
|
for attr in underscore:
|
||||||
if not self.has_extension(attr):
|
if attr not in user_keys:
|
||||||
raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
|
raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
|
||||||
value = self._.get(attr)
|
|
||||||
if not srsly.is_json_serializable(value):
|
|
||||||
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
|
|
||||||
data["_"][attr] = value
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def to_utf8_array(self, int nr_char=-1):
|
def to_utf8_array(self, int nr_char=-1):
|
||||||
|
|
|
@ -361,6 +361,7 @@ cdef class Example:
|
||||||
"doc_annotation": {
|
"doc_annotation": {
|
||||||
"cats": dict(self.reference.cats),
|
"cats": dict(self.reference.cats),
|
||||||
"entities": doc_to_biluo_tags(self.reference),
|
"entities": doc_to_biluo_tags(self.reference),
|
||||||
|
"spans": self._spans_to_dict(),
|
||||||
"links": self._links_to_dict()
|
"links": self._links_to_dict()
|
||||||
},
|
},
|
||||||
"token_annotation": {
|
"token_annotation": {
|
||||||
|
@ -376,6 +377,18 @@ cdef class Example:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _spans_to_dict(self):
|
||||||
|
span_dict = {}
|
||||||
|
for key in self.reference.spans:
|
||||||
|
span_tuples = []
|
||||||
|
for span in self.reference.spans[key]:
|
||||||
|
span_tuple = (span.start_char, span.end_char, span.label_, span.kb_id_)
|
||||||
|
span_tuples.append(span_tuple)
|
||||||
|
span_dict[key] = span_tuples
|
||||||
|
|
||||||
|
return span_dict
|
||||||
|
|
||||||
|
|
||||||
def _links_to_dict(self):
|
def _links_to_dict(self):
|
||||||
links = {}
|
links = {}
|
||||||
for ent in self.reference.ents:
|
for ent in self.reference.ents:
|
||||||
|
|
|
@ -337,3 +337,5 @@ def ensure_shape(vectors_loc):
|
||||||
# store all the results in a list in memory
|
# store all the results in a list in memory
|
||||||
lines2 = open_file(vectors_loc)
|
lines2 = open_file(vectors_loc)
|
||||||
yield from lines2
|
yield from lines2
|
||||||
|
lines2.close()
|
||||||
|
lines.close()
|
||||||
|
|
|
@ -395,12 +395,13 @@ file to keep track of your settings and hyperparameters and your own
|
||||||
> "pos": List[str],
|
> "pos": List[str],
|
||||||
> "morphs": List[str],
|
> "morphs": List[str],
|
||||||
> "sent_starts": List[Optional[bool]],
|
> "sent_starts": List[Optional[bool]],
|
||||||
> "deps": List[string],
|
> "deps": List[str],
|
||||||
> "heads": List[int],
|
> "heads": List[int],
|
||||||
> "entities": List[str],
|
> "entities": List[str],
|
||||||
> "entities": List[(int, int, str)],
|
> "entities": List[(int, int, str)],
|
||||||
> "cats": Dict[str, float],
|
> "cats": Dict[str, float],
|
||||||
> "links": Dict[(int, int), dict],
|
> "links": Dict[(int, int), dict],
|
||||||
|
> "spans": Dict[str, List[Tuple]],
|
||||||
> }
|
> }
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -417,9 +418,10 @@ file to keep track of your settings and hyperparameters and your own
|
||||||
| `deps` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. ~~List[str]~~ |
|
| `deps` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. ~~List[str]~~ |
|
||||||
| `heads` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. ~~List[int]~~ |
|
| `heads` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. ~~List[int]~~ |
|
||||||
| `entities` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. ~~List[str]~~ |
|
| `entities` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. ~~List[str]~~ |
|
||||||
| `entities` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ |
|
| `entities` | **Option 2:** List of `(start_char, end_char, label)` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ |
|
||||||
| `cats` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. ~~Dict[str, float]~~ |
|
| `cats` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. ~~Dict[str, float]~~ |
|
||||||
| `links` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. ~~Dict[Tuple[int, int], Dict]~~ |
|
| `links` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. ~~Dict[Tuple[int, int], Dict]~~ |
|
||||||
|
| `spans` | Dictionary of `spans_key`/`List[Tuple]` pairs defining the spans for each spans key as `(start_char, end_char, label, kb_id)` tuples. ~~Dict[str, List[Tuple[int, int, str, str]]~~ |
|
||||||
|
|
||||||
<Infobox title="Notes and caveats">
|
<Infobox title="Notes and caveats">
|
||||||
|
|
||||||
|
|
|
@ -240,7 +240,7 @@ browser. Will run a simple web server.
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
|
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
|
||||||
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
|
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
|
||||||
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
||||||
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
||||||
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
||||||
|
@ -265,7 +265,7 @@ Render a dependency parse tree or named entity visualization.
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ |
|
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ |
|
||||||
| `style` | Visualization style,`"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
|
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
|
||||||
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
||||||
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
||||||
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
||||||
|
@ -273,6 +273,73 @@ Render a dependency parse tree or named entity visualization.
|
||||||
| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
|
| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
|
||||||
| **RETURNS** | The rendered HTML markup. ~~str~~ |
|
| **RETURNS** | The rendered HTML markup. ~~str~~ |
|
||||||
|
|
||||||
|
### displacy.parse_deps {#displacy.parse_deps tag="method" new="2"}
|
||||||
|
|
||||||
|
Generate dependency parse in `{'words': [], 'arcs': []}` format.
|
||||||
|
For use with the `manual=True` argument in `displacy.render`.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> import spacy
|
||||||
|
> from spacy import displacy
|
||||||
|
> nlp = spacy.load("en_core_web_sm")
|
||||||
|
> doc = nlp("This is a sentence.")
|
||||||
|
> deps_parse = displacy.parse_deps(doc)
|
||||||
|
> html = displacy.render(deps_parse, style="dep", manual=True)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------------- |
|
||||||
|
| `orig_doc` | Doc to parse dependencies. ~~Doc~~ |
|
||||||
|
| `options` | Dependency parse specific visualisation options. ~~Dict[str, Any]~~ |
|
||||||
|
| **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~ |
|
||||||
|
|
||||||
|
### displacy.parse_ents {#displacy.parse_ents tag="method" new="2"}
|
||||||
|
|
||||||
|
Generate named entities in `[{start: i, end: i, label: 'label'}]` format.
|
||||||
|
For use with the `manual=True` argument in `displacy.render`.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> import spacy
|
||||||
|
> from spacy import displacy
|
||||||
|
> nlp = spacy.load("en_core_web_sm")
|
||||||
|
> doc = nlp("But Google is starting from behind.")
|
||||||
|
> ents_parse = displacy.parse_ents(doc)
|
||||||
|
> html = displacy.render(ents_parse, style="ent", manual=True)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------------- |
|
||||||
|
| `doc` | Doc to parse entities. ~~Doc~~ |
|
||||||
|
| `options` | NER-specific visualisation options. ~~Dict[str, Any]~~ |
|
||||||
|
| **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ |
|
||||||
|
|
||||||
|
### displacy.parse_spans {#displacy.parse_spans tag="method" new="2"}
|
||||||
|
|
||||||
|
Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format.
|
||||||
|
For use with the `manual=True` argument in `displacy.render`.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> import spacy
|
||||||
|
> from spacy import displacy
|
||||||
|
> nlp = spacy.load("en_core_web_sm")
|
||||||
|
> doc = nlp("But Google is starting from behind.")
|
||||||
|
> doc.spans['orgs'] = [doc[1:2]]
|
||||||
|
> ents_parse = displacy.parse_spans(doc, options={"spans_key" : "orgs"})
|
||||||
|
> html = displacy.render(ents_parse, style="span", manual=True)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------------- |
|
||||||
|
| `doc` | Doc to parse entities. ~~Doc~~ |
|
||||||
|
| `options` | Span-specific visualisation options. ~~Dict[str, Any]~~ |
|
||||||
|
| **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ |
|
||||||
|
|
||||||
### Visualizer options {#displacy_options}
|
### Visualizer options {#displacy_options}
|
||||||
|
|
||||||
The `options` argument lets you specify additional settings for each visualizer.
|
The `options` argument lets you specify additional settings for each visualizer.
|
||||||
|
|
|
@ -11,8 +11,8 @@ menu:
|
||||||
- ['Tokenization', 'tokenization']
|
- ['Tokenization', 'tokenization']
|
||||||
- ['Merging & Splitting', 'retokenization']
|
- ['Merging & Splitting', 'retokenization']
|
||||||
- ['Sentence Segmentation', 'sbd']
|
- ['Sentence Segmentation', 'sbd']
|
||||||
- ['Vectors & Similarity', 'vectors-similarity']
|
|
||||||
- ['Mappings & Exceptions', 'mappings-exceptions']
|
- ['Mappings & Exceptions', 'mappings-exceptions']
|
||||||
|
- ['Vectors & Similarity', 'vectors-similarity']
|
||||||
- ['Language Data', 'language-data']
|
- ['Language Data', 'language-data']
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
@ -198,12 +198,12 @@ import DisplacySpanHtml from 'images/displacy-span.html'
|
||||||
|
|
||||||
The span visualizer lets you customize the following `options`:
|
The span visualizer lets you customize the following `options`:
|
||||||
|
|
||||||
| Argument | Description |
|
| Argument | Description |
|
||||||
|-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|
|
| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ |
|
| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ |
|
||||||
| `templates` | Dictionary containing the keys `"span"`, `"slice"`, and `"start"`. These dictate how the overall span, a span slice, and the starting token will be rendered. ~~Optional[Dict[str, str]~~ |
|
| `templates` | Dictionary containing the keys `"span"`, `"slice"`, and `"start"`. These dictate how the overall span, a span slice, and the starting token will be rendered. ~~Optional[Dict[str, str]]~~ |
|
||||||
| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in ~~Optional[str]~~ |
|
| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a Python f-string format with a single field to fill in. ~~Optional[str]~~ |
|
||||||
| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ |
|
| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ |
|
||||||
|
|
||||||
Because spans can be stored across different keys in `doc.spans`, you need to specify
|
Because spans can be stored across different keys in `doc.spans`, you need to specify
|
||||||
which one displaCy should use with `spans_key` (`sc` is the default).
|
which one displaCy should use with `spans_key` (`sc` is the default).
|
||||||
|
@ -343,9 +343,21 @@ want to visualize output from other libraries, like [NLTK](http://www.nltk.org)
|
||||||
or
|
or
|
||||||
[SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet).
|
[SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet).
|
||||||
If you set `manual=True` on either `render()` or `serve()`, you can pass in data
|
If you set `manual=True` on either `render()` or `serve()`, you can pass in data
|
||||||
in displaCy's format as a dictionary (instead of `Doc` objects).
|
in displaCy's format as a dictionary (instead of `Doc` objects). There are helper
|
||||||
|
functions for converting `Doc` objects to displaCy's format for use with `manual=True`:
|
||||||
|
[`displacy.parse_deps`](/api/top-level#displacy.parse_deps),
|
||||||
|
[`displacy.parse_ents`](/api/top-level#displacy.parse_ents),
|
||||||
|
and [`displacy.parse_spans`](/api/top-level#displacy.parse_spans).
|
||||||
|
|
||||||
> #### Example
|
> #### Example with parse function
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("But Google is starting from behind.")
|
||||||
|
> ex = displacy.parse_ents(doc)
|
||||||
|
> html = displacy.render(ex, style="ent", manual=True)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
> #### Example with raw data
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ex = [{"text": "But Google is starting from behind.",
|
> ex = [{"text": "But Google is starting from behind.",
|
||||||
|
@ -354,6 +366,7 @@ in displaCy's format as a dictionary (instead of `Doc` objects).
|
||||||
> html = displacy.render(ex, style="ent", manual=True)
|
> html = displacy.render(ex, style="ent", manual=True)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### DEP input
|
### DEP input
|
||||||
{
|
{
|
||||||
|
@ -389,6 +402,18 @@ in displaCy's format as a dictionary (instead of `Doc` objects).
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
### SPANS input
|
||||||
|
{
|
||||||
|
"text": "Welcome to the Bank of China.",
|
||||||
|
"spans": [
|
||||||
|
{"start_token": 3, "end_token": 6, "label": "ORG"},
|
||||||
|
{"start_token": 5, "end_token": 6, "label": "GPE"},
|
||||||
|
],
|
||||||
|
"tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Using displaCy in a web application {#webapp}
|
## Using displaCy in a web application {#webapp}
|
||||||
|
|
||||||
If you want to use the visualizers as part of a web application, for example to
|
If you want to use the visualizers as part of a web application, for example to
|
||||||
|
|
|
@ -265,6 +265,11 @@
|
||||||
"name": "Luxembourgish",
|
"name": "Luxembourgish",
|
||||||
"has_examples": true
|
"has_examples": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"code": "lg",
|
||||||
|
"name": "Luganda",
|
||||||
|
"has_examples": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"code": "lij",
|
"code": "lij",
|
||||||
"name": "Ligurian",
|
"name": "Ligurian",
|
||||||
|
@ -467,10 +472,20 @@
|
||||||
"code": "uk",
|
"code": "uk",
|
||||||
"name": "Ukrainian",
|
"name": "Ukrainian",
|
||||||
"has_examples": true,
|
"has_examples": true,
|
||||||
|
"models": [
|
||||||
|
"uk_core_news_sm",
|
||||||
|
"uk_core_news_md",
|
||||||
|
"uk_core_news_lg",
|
||||||
|
"uk_core_news_trf"
|
||||||
|
],
|
||||||
"dependencies": [
|
"dependencies": [
|
||||||
{
|
{
|
||||||
"name": "pymorphy3",
|
"name": "pymorphy3",
|
||||||
"url": "https://github.com/no-plagiarism/pymorphy3"
|
"url": "https://github.com/no-plagiarism/pymorphy3"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "pymorphy3-dicts-uk",
|
||||||
|
"url": "https://github.com/no-plagiarism/pymorphy3-dicts"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
|
@ -114,7 +114,11 @@ function formatVectors(data) {
|
||||||
if (!data) return 'n/a'
|
if (!data) return 'n/a'
|
||||||
if (Object.values(data).every(n => n === 0)) return 'context vectors only'
|
if (Object.values(data).every(n => n === 0)) return 'context vectors only'
|
||||||
const { keys, vectors, width } = data
|
const { keys, vectors, width } = data
|
||||||
return `${abbrNum(keys)} keys, ${abbrNum(vectors)} unique vectors (${width} dimensions)`
|
if (keys >= 0) {
|
||||||
|
return `${abbrNum(keys)} keys, ${abbrNum(vectors)} unique vectors (${width} dimensions)`
|
||||||
|
} else {
|
||||||
|
return `${abbrNum(vectors)} floret vectors (${width} dimensions)`
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function formatAccuracy(data, lang) {
|
function formatAccuracy(data, lang) {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user