From 229ecaf0ea69ad586587ea70b8a90d59e0e64005 Mon Sep 17 00:00:00 2001
From: Duy Ngo <91310922+thebugcreator@users.noreply.github.com>
Date: Mon, 18 Apr 2022 12:58:32 +0200
Subject: [PATCH 01/42] Add numbers and definitions (#10665)

---
 spacy/lang/vi/lex_attrs.py | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/spacy/lang/vi/lex_attrs.py b/spacy/lang/vi/lex_attrs.py
index 33a3745cc..9f931446f 100644
--- a/spacy/lang/vi/lex_attrs.py
+++ b/spacy/lang/vi/lex_attrs.py
@@ -2,22 +2,29 @@ from ...attrs import LIKE_NUM


 _num_words = [
-    "không",
-    "một",
-    "hai",
-    "ba",
-    "bốn",
-    "năm",
-    "sáu",
-    "bảy",
-    "bẩy",
-    "tám",
-    "chín",
-    "mười",
-    "chục",
-    "trăm",
-    "nghìn",
-    "tỷ",
+    "không",  # Zero
+    "một",  # One
+    "mốt",  # Also one, irreplacable in niché cases for unit digit such as "51"="năm mươi mốt"
+    "hai",  # Two
+    "ba",  # Three
+    "bốn",  # Four
+    "tư",  # Also four, used in certain cases for unit digit such as "54"="năm mươi tư"
+    "năm",  # Five
+    "lăm",  # Also five, irreplacable in niché cases for unit digit such as "55"="năm mươi lăm"
+    "sáu",  # Six
+    "bảy",  # Seven
+    "bẩy",  # Also seven, old fashioned
+    "tám",  # Eight
+    "chín",  # Nine
+    "mười",  # Ten
+    "chục",  # Also ten, used for counting in tens such as "20 eggs"="hai chục trứng"
+    "trăm",  # Hundred
+    "nghìn",  # Thousand
+    "ngàn",  # Also thousand, used in the south
+    "vạn",  # Ten thousand
+    "triệu",  # Million
+    "tỷ",  # Billion
+    "tỉ"  # Also billion, used in combinatorics such as "tỉ_phú"="billionaire"
 ]


From aa6780eb27f9d37b4db017061de1b6c2ba753632 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan
Date: Mon, 18 Apr 2022 12:59:34 +0200
Subject: [PATCH 02/42] `Matcher`: Remove superfluous GIL-acquiring check in `get_is_final` (#10659)

* `Matcher`: Remove superfluous GIL-acquiring check in `get_is_final`

This check incurred a significant performance penalty due to implicit
interactions between the GIL and Cython ref-counting code.
* `Matcher`: Inline `PatternStateC` accessors
---
 spacy/matcher/matcher.pyx | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index e75ee9ce2..e43583e30 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -690,18 +690,14 @@ cdef int8_t get_is_match(PatternStateC state,
     return True


-cdef int8_t get_is_final(PatternStateC state) nogil:
+cdef inline int8_t get_is_final(PatternStateC state) nogil:
     if state.pattern[1].quantifier == FINAL_ID:
-        id_attr = state.pattern[1].attrs[0]
-        if id_attr.attr != ID:
-            with gil:
-                raise ValueError(Errors.E074.format(attr=ID, bad_attr=id_attr.attr))
         return 1
     else:
         return 0


-cdef int8_t get_quantifier(PatternStateC state) nogil:
+cdef inline int8_t get_quantifier(PatternStateC state) nogil:
     return state.pattern.quantifier


From 2a2654c756be2296777f0e6e70a0b62533e4e262 Mon Sep 17 00:00:00 2001
From: mgr
Date: Mon, 18 Apr 2022 22:04:02 +0200
Subject: [PATCH 03/42] Remove significant or not very frequent words from stop word list [es]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The list of stop words for Spanish contained many inadequate words, see:
https://github.com/explosion/spaCy/issues/3052#issuecomment-1100760100

Removed words:
- verb forms of 'trabajar' (work) and 'intentar' (try)
- words related to 'empleo' (employment)
- incorrect words: ampleamos, arribaabajo, soyos, paìs
- miscellaneous words due to being too significant or too infrequent:
  actualmente, aproximadamente, antaño, cosas, ejemplo, horas, general, pais,
  principalmente, raras

Added other stop words for completeness:
- Spanish one-letter words
- numbers up to twelve

Some reformatting to 79 columns. When in doubt, the English and German lists
have been consulted as good examples.
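For reference, the effect of the updated list can be checked at runtime. The
snippet below is only an illustrative sketch, not part of the diff itself: it
uses the standard `spacy.blank`, `Lexeme.is_stop` and `Language.Defaults.stop_words`
APIs and assumes an installation that already includes this change. The example
words ("doce" added, "actualmente" removed) are taken from the diff.

```python
import spacy

# Blank Spanish pipeline; stop words come from spacy/lang/es/stop_words.py.
nlp = spacy.blank("es")

# "doce" (twelve) is among the newly added number words up to twelve.
assert nlp.vocab["doce"].is_stop

# "actualmente" was removed, so it should no longer be flagged as a stop word.
assert not nlp.vocab["actualmente"].is_stop

# Current size of the Spanish stop word list.
print(len(nlp.Defaults.stop_words))
```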
--- spacy/lang/es/stop_words.py | 84 ++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/spacy/lang/es/stop_words.py b/spacy/lang/es/stop_words.py index 004df4fca..6d2885481 100644 --- a/spacy/lang/es/stop_words.py +++ b/spacy/lang/es/stop_words.py @@ -1,82 +1,80 @@ STOP_WORDS = set( """ -actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí -al algo alguna algunas alguno algunos algún alli allí alrededor ambos ampleamos -antano antaño ante anterior antes apenas aproximadamente aquel aquella aquellas -aquello aquellos aqui aquél aquélla aquéllas aquéllos aquí arriba arribaabajo -aseguró asi así atras aun aunque ayer añadió aún +a acuerdo adelante ademas además afirmó agregó ahi ahora ahí al algo alguna +algunas alguno algunos algún alli allí alrededor ambos ante anterior antes +apenas aproximadamente aquel aquella aquellas aquello aquellos aqui aquél +aquélla aquéllas aquéllos aquí arriba aseguró asi así atras aun aunque añadió +aún bajo bastante bien breve buen buena buenas bueno buenos -cada casi cerca cierta ciertas cierto ciertos cinco claro comentó como con -conmigo conocer conseguimos conseguir considera consideró consigo consigue -consiguen consigues contigo contra cosas creo cual cuales cualquier cuando -cuanta cuantas cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas -cuánto cuántos cómo +cada casi cierta ciertas cierto ciertos cinco claro comentó como con conmigo +conocer conseguimos conseguir considera consideró consigo consigue consiguen +consigues contigo contra creo cual cuales cualquier cuando cuanta cuantas +cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas cuánto cuántos +cómo da dado dan dar de debajo debe deben debido decir dejó del delante demasiado demás dentro deprisa desde despacio despues después detras detrás dia dias dice -dicen dicho dieron diferente diferentes dijeron dijo dio donde dos durante día -días dónde +dicen dicho dieron diez diferente diferentes dijeron dijo dio doce donde dos +durante día días dónde -ejemplo el ella ellas ello ellos embargo empleais emplean emplear empleas -empleo en encima encuentra enfrente enseguida entonces entre era eramos eran -eras eres es esa esas ese eso esos esta estaba estaban estado estados estais -estamos estan estar estará estas este esto estos estoy estuvo está están ex -excepto existe existen explicó expresó él ésa ésas ése ésos ésta éstas éste -éstos +e el ella ellas ello ellos embargo en encima encuentra enfrente enseguida +entonces entre era eramos eran eras eres es esa esas ese eso esos esta estaba +estaban estado estados estais estamos estan estar estará estas este esto estos +estoy estuvo está están excepto existe existen explicó expresó él ésa ésas ése +ésos ésta éstas éste éstos fin final fue fuera fueron fui fuimos -general gran grandes gueno +gran grande grandes ha haber habia habla hablan habrá había habían hace haceis hacemos hacen hacer hacerlo haces hacia haciendo hago han hasta hay haya he hecho hemos hicieron -hizo horas hoy hubo +hizo hoy hubo -igual incluso indicó informo informó intenta intentais intentamos intentan -intentar intentas intento ir +igual incluso indicó informo informó ir junto -la lado largo las le lejos les llegó lleva llevar lo los luego lugar +la lado largo las le les llegó lleva llevar lo los luego mal manera manifestó mas mayor me mediante medio mejor mencionó menos menudo mi -mia mias mientras mio mios mis misma mismas mismo mismos modo momento mucha -muchas mucho muchos muy más mí 
mía mías mío míos +mia mias mientras mio mios mis misma mismas mismo mismos modo mucha muchas +mucho muchos muy más mí mía mías mío míos nada nadie ni ninguna ningunas ninguno ningunos ningún no nos nosotras nosotros -nuestra nuestras nuestro nuestros nueva nuevas nuevo nuevos nunca +nuestra nuestras nuestro nuestros nueva nuevas nueve nuevo nuevos nunca -ocho os otra otras otro otros +o ocho once os otra otras otro otros -pais para parece parte partir pasada pasado paìs peor pero pesar poca pocas -poco pocos podeis podemos poder podria podriais podriamos podrian podrias podrá +para parece parte partir pasada pasado paìs peor pero pesar poca pocas poco +pocos podeis podemos poder podria podriais podriamos podrian podrias podrá podrán podría podrían poner por porque posible primer primera primero primeros -principalmente pronto propia propias propio propios proximo próximo próximos -pudo pueda puede pueden puedo pues +pronto propia propias propio propios proximo próximo próximos pudo pueda puede +pueden puedo pues -qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién quiénes qué +qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién +quiénes qué -raras realizado realizar realizó repente respecto +realizado realizar realizó repente respecto sabe sabeis sabemos saben saber sabes salvo se sea sean segun segunda segundo según seis ser sera será serán sería señaló si sido siempre siendo siete sigue -siguiente sin sino sobre sois sola solamente solas solo solos somos son soy -soyos su supuesto sus suya suyas suyo sé sí sólo +siguiente sin sino sobre sois sola solamente solas solo solos somos son soy su +supuesto sus suya suyas suyo suyos sé sí sólo tal tambien también tampoco tan tanto tarde te temprano tendrá tendrán teneis -tenemos tener tenga tengo tenido tenía tercera ti tiempo tiene tienen toda -todas todavia todavía todo todos total trabaja trabajais trabajamos trabajan -trabajar trabajas trabajo tras trata través tres tu tus tuvo tuya tuyas tuyo -tuyos tú +tenemos tener tenga tengo tenido tenía tercera tercero ti tiene tienen toda +todas todavia todavía todo todos total tras trata través tres tu tus tuvo tuya +tuyas tuyo tuyos tú -ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes +u ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes última últimas último últimos -va vais valor vamos van varias varios vaya veces ver verdad verdadera verdadero -vez vosotras vosotros voy vuestra vuestras vuestro vuestros +va vais vamos van varias varios vaya veces ver verdad verdadera verdadero vez +vosotras vosotros voy vuestra vuestras vuestro vuestros -ya yo +y ya yo """.split() ) From 3d50b1a9898d91c1a3bf796af0e849020d564480 Mon Sep 17 00:00:00 2001 From: mgr Date: Mon, 18 Apr 2022 22:12:57 +0200 Subject: [PATCH 04/42] Fix some issues in Spanish examples - Spelling: nationalities in lowercase, accent. - Incorrect verb composition - Untranslated word --- spacy/lang/es/examples.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py index 2bcbd8740..e4dfbcb6d 100644 --- a/spacy/lang/es/examples.py +++ b/spacy/lang/es/examples.py @@ -9,14 +9,14 @@ Example sentences to test spaCy and its language models. 
sentences = [ "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.", "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.", - "San Francisco analiza prohibir los robots delivery.", + "San Francisco analiza prohibir los robots de reparto.", "Londres es una gran ciudad del Reino Unido.", "El gato come pescado.", "Veo al hombre con el telescopio.", "La araña come moscas.", "El pingüino incuba en su nido sobre el hielo.", - "¿Dónde estais?", - "¿Quién es el presidente Francés?", - "¿Dónde está encuentra la capital de Argentina?", + "¿Dónde estáis?", + "¿Quién es el presidente francés?", + "¿Dónde se encuentra la capital de Argentina?", "¿Cuándo nació José de San Martín?", ] From 29afbdb91e5fecf513125a85f1ac1d165f40bc93 Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Wed, 20 Apr 2022 10:52:34 +0300 Subject: [PATCH 05/42] add readme for explosion-bot (#10677) --- extra/DEVELOPER_DOCS/ExplosionBot.md | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 extra/DEVELOPER_DOCS/ExplosionBot.md diff --git a/extra/DEVELOPER_DOCS/ExplosionBot.md b/extra/DEVELOPER_DOCS/ExplosionBot.md new file mode 100644 index 000000000..403625550 --- /dev/null +++ b/extra/DEVELOPER_DOCS/ExplosionBot.md @@ -0,0 +1,32 @@ +# Explosion-bot + +Explosion-bot is a robot that can be invoked to help with running particular test commands. + +## Permissions + +Only maintainers have permissions to summon explosion-bot. Each of the open source repos that use explosion-bot has its own team(s) of maintainers, and only github users who are members of those teams can successfully run bot commands. + +## Running robot commands + +To summon the robot, write a github comment on the issue/PR you wish to test. The comment must be in the following format: + +``` +@explosion-bot please test_gpu +``` + +Some things to note: + +* The `@explosion-bot please` must be the beginning of the command - you cannot add anything in front of this or else the robot won't know how to parse it. Adding anything at the end aside from the test name will also confuse the robot, so keep it simple! +* The command name (such as `test_gpu`) must be one of the tests that the bot knows how to run. The available commands are documented in the bot's [workflow config](https://github.com/explosion/spaCy/blob/master/.github/workflows/explosionbot.yml#L26) and must match exactly one of the commands listed there. +* The robot can't do multiple things at once, so if you want it to run multiple tests, you'll have to summon it with one comment per test. +* For the `test_gpu` command, you can specify an optional thinc branch (from the spaCy repo) or a spaCy branch (from the thinc repo) with either the `--thinc-branch` or `--spacy-branch` flags. By default, the bot will pull in the PR branch from the repo where the command was issued, and the main branch of the other repository. However, if you need to run against another branch, you can say (for example): + +``` +@explosion-bot please test_gpu --thinc-branch develop +``` + +## Troubleshooting + +If the robot isn't responding to commands as expected, you can check its logs in the [Github Action](https://github.com/explosion/spaCy/actions/workflows/explosionbot.yml). + +For each command sent to the bot, there should be a run of the `explosion-bot` workflow. 
In the `Install and run explosion-bot` step, towards the ends of the logs you should see info about the configuration that the bot was run with, as well as any errors that the bot encountered. \ No newline at end of file From 2c2dbb844c784937d7664a97239c95bbf86326ed Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 21 Apr 2022 12:55:22 +0200 Subject: [PATCH 06/42] Syntax for a branch from a PR --- extra/DEVELOPER_DOCS/ExplosionBot.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/extra/DEVELOPER_DOCS/ExplosionBot.md b/extra/DEVELOPER_DOCS/ExplosionBot.md index 403625550..eebec1a06 100644 --- a/extra/DEVELOPER_DOCS/ExplosionBot.md +++ b/extra/DEVELOPER_DOCS/ExplosionBot.md @@ -24,9 +24,13 @@ Some things to note: ``` @explosion-bot please test_gpu --thinc-branch develop ``` +You can also specify a branch from an unmerged PR: +``` +@explosion-bot please test_gpu --thinc-branch refs/pull/633/head +``` ## Troubleshooting If the robot isn't responding to commands as expected, you can check its logs in the [Github Action](https://github.com/explosion/spaCy/actions/workflows/explosionbot.yml). -For each command sent to the bot, there should be a run of the `explosion-bot` workflow. In the `Install and run explosion-bot` step, towards the ends of the logs you should see info about the configuration that the bot was run with, as well as any errors that the bot encountered. \ No newline at end of file +For each command sent to the bot, there should be a run of the `explosion-bot` workflow. In the `Install and run explosion-bot` step, towards the ends of the logs you should see info about the configuration that the bot was run with, as well as any errors that the bot encountered. From e07500369c654cbd3bf2090838a88a00c600ae2f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 22 Apr 2022 11:24:53 +0200 Subject: [PATCH 07/42] Auto-format code with black (#10687) Co-authored-by: explosion-bot --- spacy/lang/vi/lex_attrs.py | 46 +++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/spacy/lang/vi/lex_attrs.py b/spacy/lang/vi/lex_attrs.py index 9f931446f..0cbda4ffb 100644 --- a/spacy/lang/vi/lex_attrs.py +++ b/spacy/lang/vi/lex_attrs.py @@ -2,29 +2,29 @@ from ...attrs import LIKE_NUM _num_words = [ - "không", # Zero - "một", # One - "mốt", # Also one, irreplacable in niché cases for unit digit such as "51"="năm mươi mốt" - "hai", # Two - "ba", # Three - "bốn", # Four - "tư", # Also four, used in certain cases for unit digit such as "54"="năm mươi tư" - "năm", # Five - "lăm", # Also five, irreplacable in niché cases for unit digit such as "55"="năm mươi lăm" - "sáu", # Six - "bảy", # Seven - "bẩy", # Also seven, old fashioned - "tám", # Eight - "chín", # Nine - "mười", # Ten - "chục", # Also ten, used for counting in tens such as "20 eggs"="hai chục trứng" - "trăm", # Hundred - "nghìn", # Thousand - "ngàn", # Also thousand, used in the south - "vạn", # Ten thousand - "triệu", # Million - "tỷ", # Billion - "tỉ" # Also billion, used in combinatorics such as "tỉ_phú"="billionaire" + "không", # Zero + "một", # One + "mốt", # Also one, irreplacable in niché cases for unit digit such as "51"="năm mươi mốt" + "hai", # Two + "ba", # Three + "bốn", # Four + "tư", # Also four, used in certain cases for unit digit such as "54"="năm mươi tư" + "năm", # Five + "lăm", # Also five, irreplacable in niché cases for unit digit such as "55"="năm mươi lăm" + "sáu", # Six + 
"bảy", # Seven + "bẩy", # Also seven, old fashioned + "tám", # Eight + "chín", # Nine + "mười", # Ten + "chục", # Also ten, used for counting in tens such as "20 eggs"="hai chục trứng" + "trăm", # Hundred + "nghìn", # Thousand + "ngàn", # Also thousand, used in the south + "vạn", # Ten thousand + "triệu", # Million + "tỷ", # Billion + "tỉ", # Also billion, used in combinatorics such as "tỉ_phú"="billionaire" ] From 3b208197c3d7288c57bfe9831a3334b4662b416f Mon Sep 17 00:00:00 2001 From: Mike Date: Mon, 25 Apr 2022 16:40:54 +0200 Subject: [PATCH 08/42] Fixed example for spacy_syllables (#10705) There was a typo in the example for the spacy_syllables project. --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index ccd75c0c3..e67c78716 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3212,7 +3212,7 @@ "", "assert nlp.pipe_names == [\"tok2vec\", \"tagger\", \"syllables\", \"parser\", \"attribute_ruler\", \"lemmatizer\", \"ner\"]", "doc = nlp(\"terribly long\")", - "data = [(token.text, token..syllables, token..syllables_count) for token in doc]", + "data = [(token.text, token._.syllables, token._.syllables_count) for token in doc]", "assert data == [(\"terribly\", [\"ter\", \"ri\", \"bly\"], 3), (\"long\", [\"long\"], 1)]" ], "thumb": "https://raw.githubusercontent.com/sloev/spacy-syllables/master/logo.png", From 455f089c9bdbd8bb0951daeabb98f4c41d6c55f9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 25 Apr 2022 18:19:03 +0200 Subject: [PATCH 09/42] Support exclude in Doc.from_docs (#10689) * Support exclude in Doc.from_docs * Update API docs * Add new tag to docs --- spacy/tests/doc/test_doc_api.py | 23 +++++++++++ spacy/tokens/doc.pyx | 69 +++++++++++++++++++-------------- website/docs/api/doc.md | 19 +++++---- 3 files changed, 74 insertions(+), 37 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 858c7cbb6..19b554572 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1,6 +1,7 @@ import weakref import numpy +from numpy.testing import assert_array_equal import pytest from thinc.api import NumpyOps, get_current_ops @@ -634,6 +635,14 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert "group" in m_doc.spans assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) + # can exclude spans + m_doc = Doc.from_docs(en_docs, exclude=["spans"]) + assert "group" not in m_doc.spans + + # can exclude user_data + m_doc = Doc.from_docs(en_docs, exclude=["user_data"]) + assert m_doc.user_data == {} + # can merge empty docs doc = Doc.from_docs([en_tokenizer("")] * 10) @@ -647,6 +656,20 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert "group" in m_doc.spans assert len(m_doc.spans["group"]) == 0 + # with tensor + ops = get_current_ops() + for doc in en_docs: + doc.tensor = ops.asarray([[len(t.text), 0.0] for t in doc]) + m_doc = Doc.from_docs(en_docs) + assert_array_equal( + ops.to_numpy(m_doc.tensor), + ops.to_numpy(ops.xp.vstack([doc.tensor for doc in en_docs if len(doc)])), + ) + + # can exclude tensor + m_doc = Doc.from_docs(en_docs, exclude=["tensor"]) + assert m_doc.tensor.shape == (0,) + def test_doc_api_from_docs_ents(en_tokenizer): texts = ["Merging the docs is fun.", "They don't think alike."] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 1a48705fd..c36e3a02f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ 
-11,7 +11,7 @@ from enum import Enum import itertools import numpy import srsly -from thinc.api import get_array_module +from thinc.api import get_array_module, get_current_ops from thinc.util import copy_array import warnings @@ -1108,14 +1108,19 @@ cdef class Doc: return self @staticmethod - def from_docs(docs, ensure_whitespace=True, attrs=None): + def from_docs(docs, ensure_whitespace=True, attrs=None, *, exclude=tuple()): """Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share the same `Vocab`. docs (list): A list of Doc objects. - ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace. - attrs (list): Optional list of attribute ID ints or attribute name strings. - RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given. + ensure_whitespace (bool): Insert a space between two adjacent docs + whenever the first doc does not end in whitespace. + attrs (list): Optional list of attribute ID ints or attribute name + strings. + exclude (Iterable[str]): Doc attributes to exclude. Supported + attributes: `spans`, `tensor`, `user_data`. + RETURNS (Doc): A doc that contains the concatenated docs, or None if no + docs were given. DOCS: https://spacy.io/api/doc#from_docs """ @@ -1145,31 +1150,33 @@ cdef class Doc: concat_words.extend(t.text for t in doc) concat_spaces.extend(bool(t.whitespace_) for t in doc) - for key, value in doc.user_data.items(): - if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.": - data_type, name, start, end = key - if start is not None or end is not None: - start += char_offset - if end is not None: - end += char_offset - concat_user_data[(data_type, name, start, end)] = copy.copy(value) + if "user_data" not in exclude: + for key, value in doc.user_data.items(): + if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.": + data_type, name, start, end = key + if start is not None or end is not None: + start += char_offset + if end is not None: + end += char_offset + concat_user_data[(data_type, name, start, end)] = copy.copy(value) + else: + warnings.warn(Warnings.W101.format(name=name)) else: - warnings.warn(Warnings.W101.format(name=name)) - else: - warnings.warn(Warnings.W102.format(key=key, value=value)) - for key in doc.spans: - # if a spans key is in any doc, include it in the merged doc - # even if it is empty - if key not in concat_spans: - concat_spans[key] = [] - for span in doc.spans[key]: - concat_spans[key].append(( - span.start_char + char_offset, - span.end_char + char_offset, - span.label, - span.kb_id, - span.text, # included as a check - )) + warnings.warn(Warnings.W102.format(key=key, value=value)) + if "spans" not in exclude: + for key in doc.spans: + # if a spans key is in any doc, include it in the merged doc + # even if it is empty + if key not in concat_spans: + concat_spans[key] = [] + for span in doc.spans[key]: + concat_spans[key].append(( + span.start_char + char_offset, + span.end_char + char_offset, + span.label, + span.kb_id, + span.text, # included as a check + )) char_offset += len(doc.text) if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_): char_offset += 1 @@ -1210,6 +1217,10 @@ cdef class Doc: else: raise ValueError(Errors.E873.format(key=key, text=text)) + if "tensor" not in exclude and any(len(doc) for doc in docs): + ops = get_current_ops() + concat_doc.tensor = ops.xp.vstack([ops.asarray(doc.tensor) for doc in docs if 
len(doc)]) + return concat_doc def get_lca_matrix(self): diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index c28509ab0..c929a4a06 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -34,7 +34,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | Name | Description | | ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `vocab` | A storage container for lexical types. ~~Vocab~~ | -| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ | +| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ | | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | | _keyword-only_ | | | `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | @@ -304,7 +304,8 @@ ancestor is found, e.g. if span excludes a necessary ancestor. ## Doc.has_annotation {#has_annotation tag="method"} -Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes). +Check whether the doc contains annotation on a +[`Token` attribute](/api/token#attributes). @@ -398,12 +399,14 @@ Concatenate multiple `Doc` objects to form a new one. Raises an error if the > [str(ent) for doc in docs for ent in doc.ents] > ``` -| Name | Description | -| ------------------- | ----------------------------------------------------------------------------------------------------------------- | -| `docs` | A list of `Doc` objects. ~~List[Doc]~~ | -| `ensure_whitespace` | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. ~~bool~~ | -| `attrs` | Optional list of attribute ID ints or attribute name strings. ~~Optional[List[Union[str, int]]]~~ | -| **RETURNS** | The new `Doc` object that is containing the other docs or `None`, if `docs` is empty or `None`. ~~Optional[Doc]~~ | +| Name | Description | +| -------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | +| `docs` | A list of `Doc` objects. ~~List[Doc]~~ | +| `ensure_whitespace` | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. ~~bool~~ | +| `attrs` | Optional list of attribute ID ints or attribute name strings. ~~Optional[List[Union[str, int]]]~~ | +| _keyword-only_ | | +| `exclude` 3.3 | String names of Doc attributes to exclude. Supported: `spans`, `tensor`, `user_data`. ~~Iterable[str]~~ | +| **RETURNS** | The new `Doc` object that is containing the other docs or `None`, if `docs` is empty or `None`. 
~~Optional[Doc]~~ | ## Doc.to_disk {#to_disk tag="method" new="2"} From b3717ba53a15a333c627526caa020fdfc44eb747 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 27 Apr 2022 09:14:25 +0200 Subject: [PATCH 10/42] removing print statements from the test suite (#10712) --- spacy/tests/lang/tr/test_tokenizer.py | 1 - spacy/tests/pipeline/test_morphologizer.py | 2 +- spacy/tests/test_cli.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/spacy/tests/lang/tr/test_tokenizer.py b/spacy/tests/lang/tr/test_tokenizer.py index 2ceca5068..9f988eae9 100644 --- a/spacy/tests/lang/tr/test_tokenizer.py +++ b/spacy/tests/lang/tr/test_tokenizer.py @@ -694,5 +694,4 @@ TESTS = ABBREV_TESTS + URL_TESTS + NUMBER_TESTS + PUNCT_TESTS + GENERAL_TESTS def test_tr_tokenizer_handles_allcases(tr_tokenizer, text, expected_tokens): tokens = tr_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] - print(token_list) assert expected_tokens == token_list diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 11d6f0477..33696bfd8 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -184,7 +184,7 @@ def test_overfitting_IO(): token.pos_ = "" token.set_morph(None) optimizer = nlp.initialize(get_examples=lambda: train_examples) - print(nlp.get_pipe("morphologizer").labels) + assert nlp.get_pipe("morphologizer").labels is not None for i in range(50): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 5e431d5cb..26a5710a8 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -217,7 +217,6 @@ def test_cli_converters_conllu_to_docs_subtokens(): sent = converted[0]["paragraphs"][0]["sentences"][0] assert len(sent["tokens"]) == 4 tokens = sent["tokens"] - print(tokens) assert [t["orth"] for t in tokens] == ["Dommer", "FE", "avstår", "."] assert [t["tag"] for t in tokens] == [ "NOUN__Definite=Ind|Gender=Masc|Number=Sing", From c066fb8a4ee27bf0f90a60a863158ab12fc05fb3 Mon Sep 17 00:00:00 2001 From: harmbuisman Date: Wed, 27 Apr 2022 09:51:58 +0200 Subject: [PATCH 11/42] #10672: fixes displacy output for manual unsorted entities (#10673) * #10672: fixes displacy output for manual unsorted entities * #10672: removed unused import * fix prettier formatting Co-authored-by: Harm Buisman Co-authored-by: Sofie Van Landeghem --- .pre-commit-config.yaml | 1 + spacy/displacy/__init__.py | 6 +++++- spacy/tests/test_displacy.py | 15 +++++++++++++++ website/docs/api/top-level.md | 2 +- website/docs/usage/visualizers.md | 4 +--- 5 files changed, 23 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a7a12fd24..bd1baf5f7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,6 +4,7 @@ repos: hooks: - id: black language_version: python3.7 + additional_dependencies: ['click==8.0.4'] - repo: https://gitlab.com/pycqa/flake8 rev: 3.9.2 hooks: diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index aa00c95d8..5d49b6eb7 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -4,7 +4,7 @@ spaCy's built in visualization suite for dependencies and named entities. 
DOCS: https://spacy.io/api/top-level#displacy USAGE: https://spacy.io/usage/visualizers """ -from typing import List, Union, Iterable, Optional, Dict, Any, Callable +from typing import Union, Iterable, Optional, Dict, Any, Callable import warnings from .render import DependencyRenderer, EntityRenderer, SpanRenderer @@ -56,6 +56,10 @@ def render( renderer_func, converter = factories[style] renderer = renderer_func(options=options) parsed = [converter(doc, options) for doc in docs] if not manual else docs # type: ignore + if manual: + for doc in docs: + if isinstance(doc, dict) and "ents" in doc: + doc["ents"] = sorted(doc["ents"], key=lambda x: (x["start"], x["end"])) _html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip() # type: ignore html = _html["parsed"] if RENDER_WRAPPER is not None: diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index f52c36889..ccc145b44 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -338,3 +338,18 @@ def test_displacy_options_case(): assert "green" in result[1] and "bar" in result[1] assert "red" in result[2] and "FOO" in result[2] assert "green" in result[3] and "BAR" in result[3] + + +@pytest.mark.issue(10672) +def test_displacy_manual_sorted_entities(): + doc = { + "text": "But Google is starting from behind.", + "ents": [ + {"start": 14, "end": 22, "label": "SECOND"}, + {"start": 4, "end": 10, "label": "FIRST"}, + ], + "title": None, + } + + html = displacy.render(doc, style="ent", manual=True) + assert html.find("FIRST") < html.find("SECOND") diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 6d7431f28..f2fd1415f 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -263,7 +263,7 @@ Render a dependency parse tree or named entity visualization. | Name | Description | | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ | +| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ | | `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ | | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index f98c43224..770448c5a 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -342,9 +342,7 @@ want to visualize output from other libraries, like [NLTK](http://www.nltk.org) or [SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet). If you set `manual=True` on either `render()` or `serve()`, you can pass in data -in displaCy's format (instead of `Doc` objects). When setting `ents` manually, -make sure to supply them in the right order, i.e. starting with the lowest start -position. +in displaCy's format as a dictionary (instead of `Doc` objects). > #### Example > From 3579507ba102f6a1502ea634e364d955bd448f11 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 27 Apr 2022 14:49:24 +0200 Subject: [PATCH 12/42] Bumped black to 22.3.0 due to a fix for https://github.com/psf/black/issues/2964. 
(#10715) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bd1baf5f7..b959262e3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/ambv/black - rev: 21.6b0 + rev: 22.3.0 hooks: - id: black language_version: python3.7 From 10377fb945cc2eba222b2fd6e68db040669cc149 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 28 Apr 2022 13:07:49 +0200 Subject: [PATCH 13/42] Set version to v3.3.0 (#10614) * Set version to v3.3.0 * Revert "Temporarily skip tests that require models/compat" This reverts commit e422101e004a6211d5b05942c36698287d545383. --- .github/azure-steps.yml | 34 +++++++++++++++++----------------- spacy/about.py | 2 +- spacy/tests/test_cli.py | 1 - 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 742182bbe..80c88b0b8 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -64,12 +64,12 @@ steps: displayName: "Run GPU tests" condition: eq(${{ parameters.gpu }}, true) -# - script: | -# python -m spacy download ca_core_news_sm -# python -m spacy download ca_core_news_md -# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" -# displayName: 'Test download CLI' -# condition: eq(variables['python_version'], '3.8') + - script: | + python -m spacy download ca_core_news_sm + python -m spacy download ca_core_news_md + python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" + displayName: 'Test download CLI' + condition: eq(variables['python_version'], '3.8') - script: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . 
@@ -93,17 +93,17 @@ steps: displayName: 'Test train CLI' condition: eq(variables['python_version'], '3.8') -# - script: | -# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" -# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir -# displayName: 'Test assemble CLI' -# condition: eq(variables['python_version'], '3.8') -# -# - script: | -# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" -# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 -# displayName: 'Test assemble CLI vectors warning' -# condition: eq(variables['python_version'], '3.8') + - script: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" + PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + displayName: 'Test assemble CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" + python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 + displayName: 'Test assemble CLI vectors warning' + condition: eq(variables['python_version'], '3.8') - script: | python .github/validate_universe_json.py website/meta/universe.json diff --git a/spacy/about.py b/spacy/about.py index 1985ba342..03eabc2e9 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.3.0.dev0" +__version__ = "3.3.0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 26a5710a8..0fa6f5670 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -584,7 +584,6 @@ def test_download_compatibility(): assert get_minor_version(about.__version__) == get_minor_version(version) -@pytest.mark.skip(reason="Temporarily skip for dev version") def test_validate_compatibility_table(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False From 497a708c7130cda95fd08b678cb7adc109ebde0e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 28 Apr 2022 14:09:35 +0200 Subject: [PATCH 14/42] Docs for v3.3 (#10628) * Temporarily disable CI tests * Start v3.3 website updates * Add trainable lemmatizer to pipeline design * Fix Vectors.most_similar * Add floret vector info to pipeline design * Add Lower and Upper Sorbian * Add span to sidebar * Work on release notes * Copy from release notes * Update pipeline design graphic * Upgrading note about Doc.from_docs * Add tables and details * Update website/docs/models/index.md Co-authored-by: Sofie Van Landeghem * Fix da lemma acc * Add minimal intro, various updates * Round lemma acc * Add section on floret / word lists * Add new pipelines table, minor edits * Fix displacy spans example title * Clarify adding non-trainable lemmatizer * Update adding-languages URLs * Revert "Temporarily disable CI tests" This reverts commit 
1dee505920783dfad56282267c29e7f1209f5131. * Spell out words/sec Co-authored-by: Sofie Van Landeghem --- website/docs/api/doc.md | 2 +- website/docs/api/span.md | 16 +- website/docs/api/vectors.md | 16 +- website/docs/images/pipeline-design.svg | 103 +++++----- website/docs/models/index.md | 85 ++++++-- website/docs/usage/v3-3.md | 247 ++++++++++++++++++++++++ website/docs/usage/visualizers.md | 3 +- website/meta/languages.json | 10 + website/meta/sidebars.json | 3 +- website/src/templates/index.js | 4 +- 10 files changed, 407 insertions(+), 82 deletions(-) create mode 100644 website/docs/usage/v3-3.md diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index c929a4a06..0008cde31 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -621,7 +621,7 @@ relative clauses. To customize the noun chunk iterator in a loaded pipeline, modify [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk` -[syntax iterator](/usage/adding-languages#language-data) has not been +[syntax iterator](/usage/linguistic-features#language-data) has not been implemented for the given language, a `NotImplementedError` is raised. > #### Example diff --git a/website/docs/api/span.md b/website/docs/api/span.md index ff7905bc0..d765a199c 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -283,8 +283,9 @@ objects, if the document has been syntactically parsed. A base noun phrase, or it – so no NP-level coordination, no prepositional phrases, and no relative clauses. -If the `noun_chunk` [syntax iterator](/usage/adding-languages#language-data) has -not been implemeted for the given language, a `NotImplementedError` is raised. +If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data) +has not been implemeted for the given language, a `NotImplementedError` is +raised. > #### Example > @@ -520,12 +521,13 @@ sent = doc[sent.start : max(sent.end, span.end)] ## Span.sents {#sents tag="property" model="sentences" new="3.2.1"} -Returns a generator over the sentences the span belongs to. This property is only available -when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the -document by the `parser`, `senter`, `sentencizer` or some custom function. It -will raise an error otherwise. +Returns a generator over the sentences the span belongs to. This property is +only available when [sentence boundaries](/usage/linguistic-features#sbd) have +been set on the document by the `parser`, `senter`, `sentencizer` or some custom +function. It will raise an error otherwise. -If the span happens to cross sentence boundaries, all sentences the span overlaps with will be returned. +If the span happens to cross sentence boundaries, all sentences the span +overlaps with will be returned. > #### Example > diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index a651c23b0..9636ea04c 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -347,14 +347,14 @@ supported for `floret` mode. > most_similar = nlp.vocab.vectors.most_similar(queries, n=10) > ``` -| Name | Description | -| -------------- | --------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | -| `queries` | An array with one or more vectors. ~~numpy.ndarray~~ | -| _keyword-only_ | | -| `batch_size` | The batch size to use. Default to `1024`. 
~~int~~ | -| `n` | The number of entries to return for each query. Defaults to `1`. ~~int~~ | -| `sort` | Whether to sort the entries returned by score. Defaults to `True`. ~~bool~~ | -| **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. ~~Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]~~ | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------------------- | +| `queries` | An array with one or more vectors. ~~numpy.ndarray~~ | +| _keyword-only_ | | +| `batch_size` | The batch size to use. Default to `1024`. ~~int~~ | +| `n` | The number of entries to return for each query. Defaults to `1`. ~~int~~ | +| `sort` | Whether to sort the entries returned by score. Defaults to `True`. ~~bool~~ | +| **RETURNS** | The most similar entries as a `(keys, best_rows, scores)` tuple. ~~Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]~~ | ## Vectors.get_batch {#get_batch tag="method" new="3.2"} diff --git a/website/docs/images/pipeline-design.svg b/website/docs/images/pipeline-design.svg index 88ccdab99..3b528eae5 100644 --- a/website/docs/images/pipeline-design.svg +++ b/website/docs/images/pipeline-design.svg @@ -1,49 +1,56 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/docs/models/index.md b/website/docs/models/index.md index 92d1b0172..9ee96528e 100644 --- a/website/docs/models/index.md +++ b/website/docs/models/index.md @@ -30,10 +30,16 @@ into three components: tagging, parsing, lemmatization and named entity recognition, or `dep` for only tagging, parsing and lemmatization). 2. **Genre:** Type of text the pipeline is trained on, e.g. `web` or `news`. -3. **Size:** Package size indicator, `sm`, `md`, `lg` or `trf` (`sm`: no word - vectors, `md`: reduced word vector table with 20k unique vectors for ~500k - words, `lg`: large word vector table with ~500k entries, `trf`: transformer - pipeline without static word vectors) +3. **Size:** Package size indicator, `sm`, `md`, `lg` or `trf`. + + `sm` and `trf` pipelines have no static word vectors. + + For pipelines with default vectors, `md` has a reduced word vector table with + 20k unique vectors for ~500k words and `lg` has a large word vector table + with ~500k entries. + + For pipelines with floret vectors, `md` vector tables have 50k entries and + `lg` vector tables have 200k entries. For example, [`en_core_web_sm`](/models/en#en_core_web_sm) is a small English pipeline trained on written web text (blogs, news, comments), that includes @@ -90,19 +96,42 @@ Main changes from spaCy v2 models: In the `sm`/`md`/`lg` models: - The `tagger`, `morphologizer` and `parser` components listen to the `tok2vec` - component. + component. If the lemmatizer is trainable (v3.3+), `lemmatizer` also listens + to `tok2vec`. - The `attribute_ruler` maps `token.tag` to `token.pos` if there is no `morphologizer`. The `attribute_ruler` additionally makes sure whitespace is tagged consistently and copies `token.pos` to `token.tag` if there is no tagger. For English, the attribute ruler can improve its mapping from `token.tag` to `token.pos` if dependency parses from a `parser` are present, but the parser is not required. 
-- The `lemmatizer` component for many languages (Catalan, Dutch, English, - French, Greek, Italian Macedonian, Norwegian, Polish and Spanish) requires - `token.pos` annotation from either `tagger`+`attribute_ruler` or - `morphologizer`. +- The `lemmatizer` component for many languages requires `token.pos` annotation + from either `tagger`+`attribute_ruler` or `morphologizer`. - The `ner` component is independent with its own internal tok2vec layer. +#### CNN/CPU pipelines with floret vectors + +The Finnish, Korean and Swedish `md` and `lg` pipelines use +[floret vectors](/usage/v3-2#vectors) instead of default vectors. If you're +running a trained pipeline on texts and working with [`Doc`](/api/doc) objects, +you shouldn't notice any difference with floret vectors. With floret vectors no +tokens are out-of-vocabulary, so [`Token.is_oov`](/api/token#attributes) will +return `True` for all tokens. + +If you access vectors directly for similarity comparisons, there are a few +differences because floret vectors don't include a fixed word list like the +vector keys for default vectors. + +- If your workflow iterates over the vector keys, you need to use an external + word list instead: + + ```diff + - lexemes = [nlp.vocab[orth] for orth in nlp.vocab.vectors] + + lexemes = [nlp.vocab[word] for word in external_word_list] + ``` + +- [`Vectors.most_similar`](/api/vectors#most_similar) is not supported because + there's no fixed list of vectors to compare your vectors to. + ### Transformer pipeline design {#design-trf} In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present) @@ -133,10 +162,14 @@ nlp = spacy.load("en_core_web_trf", disable=["tagger", "attribute_ruler", "lemma -The lemmatizer depends on `tagger`+`attribute_ruler` or `morphologizer` for -Catalan, Dutch, English, French, Greek, Italian, Macedonian, Norwegian, Polish -and Spanish. If you disable any of these components, you'll see lemmatizer -warnings unless the lemmatizer is also disabled. +The lemmatizer depends on `tagger`+`attribute_ruler` or `morphologizer` for a +number of languages. If you disable any of these components, you'll see +lemmatizer warnings unless the lemmatizer is also disabled. + +**v3.3**: Catalan, English, French, Russian and Spanish + +**v3.0-v3.2**: Catalan, Dutch, English, French, Greek, Italian, Macedonian, +Norwegian, Polish, Russian and Spanish @@ -154,10 +187,34 @@ nlp.enable_pipe("senter") The `senter` component is ~10× faster than the parser and more accurate than the rule-based `sentencizer`. +#### Switch from trainable lemmatizer to default lemmatizer + +Since v3.3, a number of pipelines use a trainable lemmatizer. 
You can check whether +the lemmatizer is trainable: + +```python +nlp = spacy.load("de_core_web_sm") +assert nlp.get_pipe("lemmatizer").is_trainable +``` + +If you'd like to switch to a non-trainable lemmatizer that's similar to v3.2 or +earlier, you can replace the trainable lemmatizer with the default non-trainable +lemmatizer: + +```python +# Requirements: pip install spacy-lookups-data +nlp = spacy.load("de_core_web_sm") +# Remove existing lemmatizer +nlp.remove_pipe("lemmatizer") +# Add non-trainable lemmatizer from language defaults +# and load lemmatizer tables from spacy-lookups-data +nlp.add_pipe("lemmatizer").initialize() +``` + #### Switch from rule-based to lookup lemmatization For the Dutch, English, French, Greek, Macedonian, Norwegian and Spanish -pipelines, you can switch from the default rule-based lemmatizer to a lookup +pipelines, you can swap out a trainable or rule-based lemmatizer for a lookup lemmatizer: ```python diff --git a/website/docs/usage/v3-3.md b/website/docs/usage/v3-3.md new file mode 100644 index 000000000..739e2a2f9 --- /dev/null +++ b/website/docs/usage/v3-3.md @@ -0,0 +1,247 @@ +--- +title: What's New in v3.3 +teaser: New features and how to upgrade +menu: + - ['New Features', 'features'] + - ['Upgrading Notes', 'upgrading'] +--- + +## New features {#features hidden="true"} + +spaCy v3.3 improves the speed of core pipeline components, adds a new trainable +lemmatizer, and introduces trained pipelines for Finnish, Korean and Swedish. + +### Speed improvements {#speed} + +v3.3 includes a slew of speed improvements: + +- Speed up parser and NER by using constant-time head lookups. +- Support unnormalized softmax probabilities in `spacy.Tagger.v2` to speed up + inference for tagger, morphologizer, senter and trainable lemmatizer. +- Speed up parser projectivization functions. +- Replace `Ragged` with faster `AlignmentArray` in `Example` for training. +- Improve `Matcher` speed. +- Improve serialization speed for empty `Doc.spans`. + +For longer texts, the trained pipeline speeds improve **15%** or more in +prediction. We benchmarked `en_core_web_md` (same components as in v3.2) and +`de_core_news_md` (with the new trainable lemmatizer) across a range of text +sizes on Linux (Intel Xeon W-2265) and OS X (M1) to compare spaCy v3.2 vs. v3.3: + +**Intel Xeon W-2265** + +| Model | Avg. Words/Doc | v3.2 Words/Sec | v3.3 Words/Sec | Diff | +| :----------------------------------------------- | -------------: | -------------: | -------------: | -----: | +| [`en_core_web_md`](/models/en#en_core_web_md) | 100 | 17292 | 17441 | 0.86% | +| (=same components) | 1000 | 15408 | 16024 | 4.00% | +| | 10000 | 12798 | 15346 | 19.91% | +| [`de_core_news_md`](/models/de/#de_core_news_md) | 100 | 20221 | 19321 | -4.45% | +| (+v3.3 trainable lemmatizer) | 1000 | 17480 | 17345 | -0.77% | +| | 10000 | 14513 | 17036 | 17.38% | + +**Apple M1** + +| Model | Avg. 
Words/Doc | v3.2 Words/Sec | v3.3 Words/Sec | Diff | +| ------------------------------------------------ | -------------: | -------------: | -------------: | -----: | +| [`en_core_web_md`](/models/en#en_core_web_md) | 100 | 18272 | 18408 | 0.74% | +| (=same components) | 1000 | 18794 | 19248 | 2.42% | +| | 10000 | 15144 | 17513 | 15.64% | +| [`de_core_news_md`](/models/de/#de_core_news_md) | 100 | 19227 | 19591 | 1.89% | +| (+v3.3 trainable lemmatizer) | 1000 | 20047 | 20628 | 2.90% | +| | 10000 | 15921 | 18546 | 16.49% | + +### Trainable lemmatizer {#trainable-lemmatizer} + +The new [trainable lemmatizer](/api/edittreelemmatizer) component uses +[edit trees](https://explosion.ai/blog/edit-tree-lemmatizer) to transform tokens +into lemmas. Try out the trainable lemmatizer with the +[training quickstart](/usage/training#quickstart)! + +### displaCy support for overlapping spans and arcs {#displacy} + +displaCy now supports overlapping spans with a new +[`span`](/usage/visualizers#span) style and multiple arcs with different labels +between the same tokens for [`dep`](/usage/visualizers#dep) visualizations. + +Overlapping spans can be visualized for any spans key in `doc.spans`: + +```python +import spacy +from spacy import displacy +from spacy.tokens import Span + +nlp = spacy.blank("en") +text = "Welcome to the Bank of China." +doc = nlp(text) +doc.spans["custom"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")] +displacy.serve(doc, style="span", options={"spans_key": "custom"}) +``` + +import DisplacySpanHtml from 'images/displacy-span.html' + + + +## Additional features and improvements + +- Config comparisons with [`spacy debug diff-config`](/api/cli#debug-diff). +- Span suggester debugging with + [`SpanCategorizer.set_candidates`](/api/spancategorizer#set_candidates). +- Big endian support with + [`thinc-bigendian-ops`](https://github.com/andrewsi-z/thinc-bigendian-ops) and + updates to make `floret`, `murmurhash`, Thinc and spaCy endian neutral. +- Initial support for Lower Sorbian and Upper Sorbian. +- Language updates for English, French, Italian, Japanese, Korean, Norwegian, + Russian, Slovenian, Spanish, Turkish, Ukrainian and Vietnamese. +- New noun chunks for Finnish. + +## Trained pipelines {#pipelines} + +### New trained pipelines {#new-pipelines} + +v3.3 introduces new CPU/CNN pipelines for Finnish, Korean and Swedish, which use +the new trainable lemmatizer and +[floret vectors](https://github.com/explosion/floret). Due to the use +[Bloom embeddings](https://explosion.ai/blog/bloom-embeddings) and subwords, the +pipelines have compact vectors with no out-of-vocabulary words. 
+ +| Package | Language | UPOS | Parser LAS | NER F | +| ----------------------------------------------- | -------- | ---: | ---------: | ----: | +| [`fi_core_news_sm`](/models/fi#fi_core_news_sm) | Finnish | 92.5 | 71.9 | 75.9 | +| [`fi_core_news_md`](/models/fi#fi_core_news_md) | Finnish | 95.9 | 78.6 | 80.6 | +| [`fi_core_news_lg`](/models/fi#fi_core_news_lg) | Finnish | 96.2 | 79.4 | 82.4 | +| [`ko_core_news_sm`](/models/ko#ko_core_news_sm) | Korean | 86.1 | 65.6 | 71.3 | +| [`ko_core_news_md`](/models/ko#ko_core_news_md) | Korean | 94.7 | 80.9 | 83.1 | +| [`ko_core_news_lg`](/models/ko#ko_core_news_lg) | Korean | 94.7 | 81.3 | 85.3 | +| [`sv_core_news_sm`](/models/sv#sv_core_news_sm) | Swedish | 95.0 | 75.9 | 74.7 | +| [`sv_core_news_md`](/models/sv#sv_core_news_md) | Swedish | 96.3 | 78.5 | 79.3 | +| [`sv_core_news_lg`](/models/sv#sv_core_news_lg) | Swedish | 96.3 | 79.1 | 81.1 | + +### Pipeline updates {#pipeline-updates} + +The following languages switch from lookup or rule-based lemmatizers to the new +trainable lemmatizer: Danish, Dutch, German, Greek, Italian, Lithuanian, +Norwegian, Polish, Portuguese and Romanian. The overall lemmatizer accuracy +improves for all of these pipelines, but be aware that the types of errors may +look quite different from the lookup-based lemmatizers. If you'd prefer to +continue using the previous lemmatizer, you can +[switch from the trainable lemmatizer to a non-trainable lemmatizer](/models#design-modify). + +
+ +| Model | v3.2 Lemma Acc | v3.3 Lemma Acc | +| ----------------------------------------------- | -------------: | -------------: | +| [`da_core_news_md`](/models/da#da_core_news_md) | 84.9 | 94.8 | +| [`de_core_news_md`](/models/de#de_core_news_md) | 73.4 | 97.7 | +| [`el_core_news_md`](/models/el#el_core_news_md) | 56.5 | 88.9 | +| [`fi_core_news_md`](/models/fi#fi_core_news_md) | - | 86.2 | +| [`it_core_news_md`](/models/it#it_core_news_md) | 86.6 | 97.2 | +| [`ko_core_news_md`](/models/ko#ko_core_news_md) | - | 90.0 | +| [`lt_core_news_md`](/models/lt#lt_core_news_md) | 71.1 | 84.8 | +| [`nb_core_news_md`](/models/nb#nb_core_news_md) | 76.7 | 97.1 | +| [`nl_core_news_md`](/models/nl#nl_core_news_md) | 81.5 | 94.0 | +| [`pl_core_news_md`](/models/pl#pl_core_news_md) | 87.1 | 93.7 | +| [`pt_core_news_md`](/models/pt#pt_core_news_md) | 76.7 | 96.9 | +| [`ro_core_news_md`](/models/ro#ro_core_news_md) | 81.8 | 95.5 | +| [`sv_core_news_md`](/models/sv#sv_core_news_md) | - | 95.5 | + +
+ +In addition, the vectors in the English pipelines are deduplicated to improve +the pruned vectors in the `md` models and reduce the `lg` model size. + +## Notes about upgrading from v3.2 {#upgrading} + +### Span comparisons + +Span comparisons involving ordering (`<`, `<=`, `>`, `>=`) now take all span +attributes into account (start, end, label, and KB ID) so spans may be sorted in +a slightly different order. + +### Whitespace annotation + +During training, annotation on whitespace tokens is handled in the same way as +annotation on non-whitespace tokens in order to allow custom whitespace +annotation. + +### Doc.from_docs + +[`Doc.from_docs`](/api/doc#from_docs) now includes `Doc.tensor` by default and +supports excludes with an `exclude` argument in the same format as +`Doc.to_bytes`. The supported exclude fields are `spans`, `tensor` and +`user_data`. + +Docs including `Doc.tensor` may be quite a bit larger in RAM, so to exclude +`Doc.tensor` as in v3.2: + +```diff +-merged_doc = Doc.from_docs(docs) ++merged_doc = Doc.from_docs(docs, exclude=["tensor"]) +``` + +### Using trained pipelines with floret vectors + +If you're running a new trained pipeline for Finnish, Korean or Swedish on new +texts and working with `Doc` objects, you shouldn't notice any difference with +floret vectors vs. default vectors. + +If you use vectors for similarity comparisons, there are a few differences, +mainly because a floret pipeline doesn't include any kind of frequency-based +word list similar to the list of in-vocabulary vector keys with default vectors. + +- If your workflow iterates over the vector keys, you should use an external + word list instead: + + ```diff + - lexemes = [nlp.vocab[orth] for orth in nlp.vocab.vectors] + + lexemes = [nlp.vocab[word] for word in external_word_list] + ``` + +- `Vectors.most_similar` is not supported because there's no fixed list of + vectors to compare your vectors to. + +### Pipeline package version compatibility {#version-compat} + +> #### Using legacy implementations +> +> In spaCy v3, you'll still be able to load and reference legacy implementations +> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the +> components or architectures change and newer versions are available in the +> core library. + +When you're loading a pipeline package trained with an earlier version of spaCy +v3, you will see a warning telling you that the pipeline may be incompatible. +This doesn't necessarily have to be true, but we recommend running your +pipelines against your test suite or evaluation data to make sure there are no +unexpected results. + +If you're using one of the [trained pipelines](/models) we provide, you should +run [`spacy download`](/api/cli#download) to update to the latest version. To +see an overview of all installed packages and their compatibility, you can run +[`spacy validate`](/api/cli#validate). 
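As a small illustration of the span-ordering change described under "Span comparisons" above (not taken from the docs; a blank English pipeline is assumed):

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("the Bank of China")
# identical boundaries but different labels: ordering now also considers
# label and KB ID, so the sorted order may differ from v3.2
spans = [Span(doc, 1, 4, label="ORG"), Span(doc, 1, 4, label="FAC")]
spans.sort()
print([(span.text, span.label_) for span in spans])
```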
+ +If you've trained your own custom pipeline and you've confirmed that it's still +working as expected, you can update the spaCy version requirements in the +[`meta.json`](/api/data-formats#meta): + +```diff +- "spacy_version": ">=3.2.0,<3.3.0", ++ "spacy_version": ">=3.2.0,<3.4.0", +``` + +### Updating v3.2 configs + +To update a config from spaCy v3.2 with the new v3.3 settings, run +[`init fill-config`](/api/cli#init-fill-config): + +```cli +$ python -m spacy init fill-config config-v3.2.cfg config-v3.3.cfg +``` + +In many cases ([`spacy train`](/api/cli#train), +[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in +automatically, but you'll need to fill in the new settings to run +[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data). + +To see the speed improvements for the +[`Tagger` architecture](/api/architectures#Tagger), edit your config to switch +from `spacy.Tagger.v1` to `spacy.Tagger.v2` and then run `init fill-config`. diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index 770448c5a..d2892b863 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -5,6 +5,7 @@ new: 2 menu: - ['Dependencies', 'dep'] - ['Named Entities', 'ent'] + - ['Spans', 'span'] - ['Jupyter Notebooks', 'jupyter'] - ['Rendering HTML', 'html'] - ['Web app usage', 'webapp'] @@ -192,7 +193,7 @@ displacy.serve(doc, style="span") import DisplacySpanHtml from 'images/displacy-span.html' - + The span visualizer lets you customize the following `options`: diff --git a/website/meta/languages.json b/website/meta/languages.json index 1c4379b6d..64ca7a082 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -62,6 +62,11 @@ "example": "Dies ist ein Satz.", "has_examples": true }, + { + "code": "dsb", + "name": "Lower Sorbian", + "has_examples": true + }, { "code": "el", "name": "Greek", @@ -159,6 +164,11 @@ "name": "Croatian", "has_examples": true }, + { + "code": "hsb", + "name": "Upper Sorbian", + "has_examples": true + }, { "code": "hu", "name": "Hungarian", diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 2229c91f3..cf3f1398e 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -11,7 +11,8 @@ { "text": "spaCy 101", "url": "/usage/spacy-101" }, { "text": "New in v3.0", "url": "/usage/v3" }, { "text": "New in v3.1", "url": "/usage/v3-1" }, - { "text": "New in v3.2", "url": "/usage/v3-2" } + { "text": "New in v3.2", "url": "/usage/v3-2" }, + { "text": "New in v3.3", "url": "/usage/v3-3" } ] }, { diff --git a/website/src/templates/index.js b/website/src/templates/index.js index dfd59e424..bdbdbd431 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -120,8 +120,8 @@ const AlertSpace = ({ nightly, legacy }) => { } const navAlert = ( -
💥 Out now: spaCy v3.2 + + 💥 Out now: spaCy v3.3 ) From f3de976513706b6a0014a606aff047dc4a8bbc41 Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Mon, 2 May 2022 13:35:14 +0200 Subject: [PATCH 15/42] Update universe.json to Include spaCy video #6 (#10723) * Update universe.json I noticed that episode 6 was missing, so I added it. * Update universe.json * Update universe.json --- website/meta/universe.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index e67c78716..11a174c8c 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2087,6 +2087,20 @@ "youtube": "f4sqeLRzkPg", "category": ["videos"] }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-6", + "title": "Intro to NLP with spaCy (6)", + "slogan": "Episode 6: Moving to spaCy v3", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "k77RrmMaKEI", + "category": ["videos"] + }, { "type": "education", "id": "video-spacy-irl-entity-linking", From 0a503ce5e0697c617d9bfae8cc0dc5141e3673f9 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Mon, 2 May 2022 13:36:35 +0200 Subject: [PATCH 16/42] Remove vestigial debug print statement in `walk_head_nodes` (#10718) * `graph`: Remove vestigial debug print statement in `walk_head_nodes` * Revert whitespace changes * Remove more debug print statements --- spacy/tokens/graph.pyx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx index 9351435f8..9d64f924e 100644 --- a/spacy/tokens/graph.pyx +++ b/spacy/tokens/graph.pyx @@ -484,7 +484,6 @@ cdef class Graph: for idx in indices: node.push_back(idx) i = add_node(&self.c, node) - print("Add node", indices, i) return Node(self, i) def get_node(self, indices) -> Node: @@ -501,7 +500,6 @@ cdef class Graph: if node_index < 0: return NoneNode(self) else: - print("Get node", indices, node_index) return Node(self, node_index) def has_node(self, tuple indices) -> bool: @@ -661,8 +659,6 @@ cdef int walk_head_nodes(vector[int]& output, const GraphC* graph, int node) nog seen.insert(node) i = 0 while i < output.size(): - with gil: - print("Walk up from", output[i]) if seen.find(output[i]) == seen.end(): seen.insert(output[i]) get_head_nodes(output, graph, output[i]) From f5390e278a886663aa3a8e850c677b357c0ca119 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 2 May 2022 13:38:46 +0200 Subject: [PATCH 17/42] Refactor error messages to remove hardcoded strings (#10729) * Use custom error msg instead of hardcoded string: replaced remaining hardcoded error message strings. * Use custom error msg instead of hardcoded string: fixing faulty Errors import. 
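A brief sketch of how this refactor surfaces to users, not part of the patch itself and based on the `E1032` message added in the diff below:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("custom error messages replace hardcoded strings")
span = doc[1:3]
try:
    span.start = -1   # previously raised a bare IndexError("TODO")
except IndexError as err:
    print(err)        # now a formatted message built from Errors.E1032
```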
--- spacy/errors.py | 10 +++++++++- spacy/ml/parser_model.pyx | 5 +++-- spacy/pipeline/_edit_tree_internals/edit_trees.pyx | 8 ++++---- spacy/pipeline/_parser_internals/arc_eager.pyx | 2 +- spacy/tokens/graph.pyx | 8 +++++--- spacy/tokens/span.pyx | 8 ++++---- 6 files changed, 26 insertions(+), 15 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index abf710d7c..b01afcb80 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -905,7 +905,15 @@ class Errors(metaclass=ErrorsWithCodes): E1026 = ("Edit tree has an invalid format:\n{errors}") E1027 = ("AlignmentArray only supports slicing with a step of 1.") E1028 = ("AlignmentArray only supports indexing using an int or a slice.") - + E1029 = ("Edit tree cannot be applied to form.") + E1030 = ("Edit tree identifier out of range.") + E1031 = ("Could not find gold transition - see logs above.") + E1032 = ("`{var}` should not be {forbidden}, but received {value}.") + E1033 = ("Dimension {name} invalid -- only nO, nF, nP") + E1034 = ("Node index {i} out of bounds ({length})") + E1035 = ("Token index {i} out of bounds ({length})") + E1036 = ("Cannot index into NoneNode") + # Deprecated model shortcuts, only used in errors and warnings OLD_MODEL_SHORTCUTS = { diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx index da937ca4f..4e854178d 100644 --- a/spacy/ml/parser_model.pyx +++ b/spacy/ml/parser_model.pyx @@ -11,6 +11,7 @@ import numpy.random from thinc.api import Model, CupyOps, NumpyOps from .. import util +from ..errors import Errors from ..typedefs cimport weight_t, class_t, hash_t from ..pipeline._parser_internals.stateclass cimport StateClass @@ -411,7 +412,7 @@ cdef class precompute_hiddens: elif name == "nO": return self.nO else: - raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP") + raise ValueError(Errors.E1033.format(name=name)) def set_dim(self, name, value): if name == "nF": @@ -421,7 +422,7 @@ cdef class precompute_hiddens: elif name == "nO": self.nO = value else: - raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP") + raise ValueError(Errors.E1033.format(name=name)) def __call__(self, X, bint is_train): if is_train: diff --git a/spacy/pipeline/_edit_tree_internals/edit_trees.pyx b/spacy/pipeline/_edit_tree_internals/edit_trees.pyx index 02907b67a..9d18c0334 100644 --- a/spacy/pipeline/_edit_tree_internals/edit_trees.pyx +++ b/spacy/pipeline/_edit_tree_internals/edit_trees.pyx @@ -132,7 +132,7 @@ cdef class EditTrees: could not be applied to the form. """ if tree_id >= self.trees.size(): - raise IndexError("Edit tree identifier out of range") + raise IndexError(Errors.E1030) lemma_pieces = [] try: @@ -154,7 +154,7 @@ cdef class EditTrees: match_node = tree.inner.match_node if match_node.prefix_len + match_node.suffix_len > len(form_part): - raise ValueError("Edit tree cannot be applied to form") + raise ValueError(Errors.E1029) suffix_start = len(form_part) - match_node.suffix_len @@ -169,7 +169,7 @@ cdef class EditTrees: if form_part == self.strings[tree.inner.subst_node.orig]: lemma_pieces.append(self.strings[tree.inner.subst_node.subst]) else: - raise ValueError("Edit tree cannot be applied to form") + raise ValueError(Errors.E1029) cpdef unicode tree_to_str(self, uint32_t tree_id): """Return the tree as a string. 
The tree tree string is formatted @@ -187,7 +187,7 @@ cdef class EditTrees: """ if tree_id >= self.trees.size(): - raise IndexError("Edit tree identifier out of range") + raise IndexError(Errors.E1030) cdef EditTreeC tree = self.trees[tree_id] cdef SubstNodeC subst_node diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index f1165592e..d60f1c3e6 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -824,7 +824,7 @@ cdef class ArcEager(TransitionSystem): for i in range(self.n_moves): print(self.get_class_name(i), is_valid[i], costs[i]) print("Gold sent starts?", is_sent_start(&gold_state, state.B(0)), is_sent_start(&gold_state, state.B(1))) - raise ValueError("Could not find gold transition - see logs above.") + raise ValueError(Errors.E1031) def get_oracle_sequence_from_state(self, StateClass state, ArcEagerGold gold, _debug=None): cdef int i diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx index 9d64f924e..adc4d23c8 100644 --- a/spacy/tokens/graph.pyx +++ b/spacy/tokens/graph.pyx @@ -9,6 +9,8 @@ cimport cython import weakref from preshed.maps cimport map_get_unless_missing from murmurhash.mrmr cimport hash64 + +from .. import Errors from ..typedefs cimport hash_t from ..strings import get_string_id from ..structs cimport EdgeC, GraphC @@ -68,7 +70,7 @@ cdef class Node: """ cdef int length = graph.c.nodes.size() if i >= length or -i >= length: - raise IndexError(f"Node index {i} out of bounds ({length})") + raise IndexError(Errors.E1034.format(i=i, length=length)) if i < 0: i += length self.graph = graph @@ -88,7 +90,7 @@ cdef class Node: """Get a token index from the node's set of tokens.""" length = self.graph.c.nodes[self.i].size() if i >= length or -i >= length: - raise IndexError(f"Token index {i} out of bounds ({length})") + raise IndexError(Errors.E1035.format(i=i, length=length)) if i < 0: i += length return self.graph.c.nodes[self.i][i] @@ -306,7 +308,7 @@ cdef class NoneNode(Node): self.i = -1 def __getitem__(self, int i): - raise IndexError("Cannot index into NoneNode.") + raise IndexError(Errors.E1036) def __len__(self): return 0 diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 4b0c724e5..305d7caf4 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -730,7 +730,7 @@ cdef class Span: def __set__(self, int start): if start < 0: - raise IndexError("TODO") + raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start)) self.c.start = start property end: @@ -739,7 +739,7 @@ cdef class Span: def __set__(self, int end): if end < 0: - raise IndexError("TODO") + raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end)) self.c.end = end property start_char: @@ -748,7 +748,7 @@ cdef class Span: def __set__(self, int start_char): if start_char < 0: - raise IndexError("TODO") + raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char)) self.c.start_char = start_char property end_char: @@ -757,7 +757,7 @@ cdef class Span: def __set__(self, int end_char): if end_char < 0: - raise IndexError("TODO") + raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char)) self.c.end_char = end_char property label: From e03b9f8095b2f98c1761cfb3020fe77994e9042d Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 3 May 2022 13:55:27 +0200 Subject: [PATCH 18/42] Small doc typos (#10750) * fix typos * formatting --- 
website/docs/usage/rule-based-matching.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 710c52dfd..be9a56dc8 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -949,7 +949,7 @@ for match_id, start, end in matcher(doc): The examples here use [`nlp.make_doc`](/api/language#make_doc) to create `Doc` object patterns as efficiently as possible and without running any of the other -pipeline components. If the token attribute you want to match on are set by a +pipeline components. If the token attribute you want to match on is set by a pipeline component, **make sure that the pipeline component runs** when you create the pattern. For example, to match on `POS` or `LEMMA`, the pattern `Doc` objects need to have part-of-speech tags set by the `tagger` or `morphologizer`. @@ -960,9 +960,9 @@ disable components selectively.
Another possible use case is matching number tokens like IP addresses based on -their shape. This means that you won't have to worry about how those string will -be tokenized and you'll be able to find tokens and combinations of tokens based -on a few examples. Here, we're matching on the shapes `ddd.d.d.d` and +their shape. This means that you won't have to worry about how those strings +will be tokenized and you'll be able to find tokens and combinations of tokens +based on a few examples. Here, we're matching on the shapes `ddd.d.d.d` and `ddd.ddd.d.d`: ```python @@ -1433,7 +1433,7 @@ of `"phrase_matcher_attr": "POS"` for the entity ruler. Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns. As of spaCy v2.2.4 the `add_patterns` function has been refactored to use -nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with +`nlp.pipe` on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively. Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time. An easy workaround to make this function run faster is From 0a92d5644ef3fcb201f284e623ada019c4ab0b8b Mon Sep 17 00:00:00 2001 From: Luca Dorigo Date: Tue, 3 May 2022 17:57:07 +0200 Subject: [PATCH 19/42] Fix StringStore.__getitem__ return type depending on parameter types (#10741) * Fix StringStore.__getitem__ return type depending on parameter types Small fix using `@overload` so that `StringStore.__getitem__` returns an `int` when given a `str` or `bytes` and a `str` when given an `int`. * Update spacy/strings.pyi Co-authored-by: Adriane Boyd --- spacy/strings.pyi | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/spacy/strings.pyi b/spacy/strings.pyi index 5b4147e12..b29389b9a 100644 --- a/spacy/strings.pyi +++ b/spacy/strings.pyi @@ -1,4 +1,4 @@ -from typing import Optional, Iterable, Iterator, Union, Any +from typing import Optional, Iterable, Iterator, Union, Any, overload from pathlib import Path def get_string_id(key: Union[str, int]) -> int: ... @@ -7,7 +7,10 @@ class StringStore: def __init__( self, strings: Optional[Iterable[str]] = ..., freeze: bool = ... ) -> None: ... - def __getitem__(self, string_or_id: Union[bytes, str, int]) -> Union[str, int]: ... + @overload + def __getitem__(self, string_or_id: Union[bytes, str]) -> int: ... + @overload + def __getitem__(self, string_or_id: int) -> str: ... def as_int(self, key: Union[bytes, str, int]) -> int: ... def as_string(self, key: Union[bytes, str, int]) -> str: ... def add(self, string: str) -> int: ... From c32e1a0079f7436e6e58b976de8903bf48ffb36f Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Fri, 6 May 2022 13:21:39 +0200 Subject: [PATCH 20/42] Updated Coreferee Universe entry (#10763) --- website/meta/universe.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 11a174c8c..e37c918ca 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2817,9 +2817,9 @@ "id": "coreferee", "title": "Coreferee", "slogan": "Coreference resolution for multiple languages", - "github": "msg-systems/coreferee", - "url": "https://github.com/msg-systems/coreferee", - "description": "Coreferee is a pipeline plugin that performs coreference resolution for English, German and Polish. 
It is designed so that it is easy to add support for new languages and optimised for limited training data. It uses a mixture of neural networks and programmed rules. Please note you will need to [install models](https://github.com/msg-systems/coreferee#getting-started) before running the code example.", + "github": "explosion/coreferee", + "url": "https://github.com/explosion/coreferee", + "description": "Coreferee is a pipeline plugin that performs coreference resolution for English, French, German and Polish. It is designed so that it is easy to add support for new languages and optimised for limited training data. It uses a mixture of neural networks and programmed rules. Please note you will need to [install models](https://github.com/explosion/coreferee#getting-started) before running the code example.", "pip": "coreferee", "category": ["pipeline", "models", "standalone"], "tags": ["coreference-resolution", "anaphora"], From e626df959fdcbf7a5fbc9d24a86af8e093238c82 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 6 May 2022 15:40:59 +0200 Subject: [PATCH 21/42] Document different ways to create a pipeline (#10762) * Document different ways to create a pipeline: moved up/slightly modified paragraph on pipeline creation. * Document different ways to create a pipeline: changed Finnish to Ukrainian in example for language without trained pipeline. * Document different ways to create a pipeline: added explanation of blank pipeline. * Document different ways to create a pipeline: exchanged Ukrainian with Yoruba. --- website/docs/usage/models.md | 51 ++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index f82da44d9..56992e7e3 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -27,6 +27,35 @@ import QuickstartModels from 'widgets/quickstart-models.js' +### Usage note + +> If lemmatization rules are available for your language, make sure to install +> spaCy with the `lookups` option, or install +> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) +> separately in the same environment: +> +> ```bash +> $ pip install -U %%SPACY_PKG_NAME[lookups]%%SPACY_PKG_FLAGS +> ``` + +If a trained pipeline is available for a language, you can download it using the +[`spacy download`](/api/cli#download) command as shown above. In order to use +languages that don't yet come with a trained pipeline, you have to import them +directly, or use [`spacy.blank`](/api/top-level#spacy.blank): + +```python +from spacy.lang.yo import Yoruba +nlp = Yoruba() # use directly +nlp = spacy.blank("yo") # blank instance +``` + +A blank pipeline is typically just a tokenizer. You might want to create a blank +pipeline when you only need a tokenizer, when you want to add more components +from scratch, or for testing purposes. Initializing the language object directly +yields the same result as generating it using `spacy.blank()`. In both cases the +default configuration for the chosen language is loaded, and no pretrained +components will be available. + ## Language support {#languages} spaCy currently provides support for the following languages. You can help by @@ -37,28 +66,6 @@ contribute to development. Also see the [training documentation](/usage/training) for how to train your own pipelines on your data. -> #### Usage note -> -> If a trained pipeline is available for a language, you can download it using -> the [`spacy download`](/api/cli#download) command. 
In order to use languages -> that don't yet come with a trained pipeline, you have to import them directly, -> or use [`spacy.blank`](/api/top-level#spacy.blank): -> -> ```python -> from spacy.lang.fi import Finnish -> nlp = Finnish() # use directly -> nlp = spacy.blank("fi") # blank instance -> ``` -> -> If lemmatization rules are available for your language, make sure to install -> spaCy with the `lookups` option, or install -> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) -> separately in the same environment: -> -> ```bash -> $ pip install -U %%SPACY_PKG_NAME[lookups]%%SPACY_PKG_FLAGS -> ``` - import Languages from 'widgets/languages.js' From 733114bdd91b707209956e521a6b423f2154ce72 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Mon, 9 May 2022 19:44:14 +0200 Subject: [PATCH 22/42] `training.md`: Fix typos (#10775) --- website/docs/usage/training.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index f46f0052b..5e064b269 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -247,7 +247,7 @@ a consistent format. There are no command-line arguments that need to be set, and no hidden defaults. However, there can still be scenarios where you may want to override config settings when you run [`spacy train`](/api/cli#train). This includes **file paths** to vectors or other resources that shouldn't be -hard-code in a config file, or **system-dependent settings**. +hard-coded in a config file, or **system-dependent settings**. For cases like this, you can set additional command-line options starting with `--` that correspond to the config section and value to override. For example, @@ -730,7 +730,7 @@ with the name of the respective [registry](/api/top-level#registry), e.g. `@spacy.registry.architectures`, and a string name to assign to your function. Registering custom functions allows you to **plug in models** defined in PyTorch or TensorFlow, make **custom modifications** to the `nlp` object, create custom -optimizers or schedules, or **stream in data** and preprocesses it on the fly +optimizers or schedules, or **stream in data** and preprocess it on the fly while training. 
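For instance, a minimal sketch of such a registered function; the `misc` registry and the names used here are illustrative assumptions rather than something from the patched docs, and the argument with a type hint and default ties into the point below about arguments being filled in via the config:

```python
import spacy

@spacy.registry.misc("my_project.min_length_filter.v1")
def make_min_length_filter(min_length: int = 5):
    # the config can now reference this via {"@misc": "my_project.min_length_filter.v1"}
    def filter_examples(examples):
        return [eg for eg in examples if len(eg.reference) >= min_length]
    return filter_examples
```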
Each custom function can have any number of arguments that are passed in via the From 1543558d0805c78be6f4fac04fb5c764d8daa20f Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 10 May 2022 08:24:42 +0200 Subject: [PATCH 23/42] Add test for old architectures (#10751) * add v1 and v2 tests for tok2vec architectures * textcat architectures are not "layers" * test older textcat architectures * test older parser architecture --- spacy/tests/parser/test_parse.py | 29 ++++++++++ spacy/tests/pipeline/test_textcat.py | 19 ++++++- spacy/tests/pipeline/test_tok2vec.py | 39 +++++++++---- website/docs/api/legacy.md | 85 ++++++++++++++-------------- 4 files changed, 119 insertions(+), 53 deletions(-) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 7bbb30d8e..aaf31ed56 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -12,6 +12,7 @@ from spacy.vocab import Vocab from ...pipeline import DependencyParser from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL from ..util import apply_transition_sequence, make_tempdir +from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL TRAIN_DATA = [ ( @@ -395,6 +396,34 @@ def test_overfitting_IO(pipe_name): assert_equal(batch_deps_1, no_batch_deps) +# fmt: off +@pytest.mark.slow +@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"]) +@pytest.mark.parametrize( + "parser_config", + [ + # TransitionBasedParser V1 + ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}), + # TransitionBasedParser V2 + ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}), + ], +) +# fmt: on +def test_parser_configs(pipe_name, parser_config): + pipe_config = {"model": parser_config} + nlp = English() + parser = nlp.add_pipe(pipe_name, config=pipe_config) + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + for dep in annotations.get("deps", []): + parser.add_label(dep) + optimizer = nlp.initialize() + for i in range(5): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + def test_beam_parser_scores(): # Test that we can get confidence values out of the beam_parser pipe beam_width = 16 diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 798dd165e..0bb036a33 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -382,6 +382,7 @@ def test_implicit_label(name, get_examples): # fmt: off +@pytest.mark.slow @pytest.mark.parametrize( "name,textcat_config", [ @@ -390,7 +391,10 @@ def test_implicit_label(name, get_examples): ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), - # ENSEMBLE + # ENSEMBLE V1 + ("textcat", {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 
1, "ngram_size": 1, "dropout": None}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}), + # ENSEMBLE V2 ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}}), ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}}), ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}}), @@ -643,15 +647,28 @@ def test_overfitting_IO_multi(): # fmt: off +@pytest.mark.slow @pytest.mark.parametrize( "name,train_data,textcat_config", [ + # BOW V1 + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), + # ENSEMBLE V1 + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}), + # CNN V1 + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), + # BOW V2 ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), + # ENSEMBLE V2 ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), + # CNN V2 ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": 
DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ], diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 37104c78a..64faf133d 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -1,13 +1,13 @@ import pytest from spacy.ml.models.tok2vec import build_Tok2Vec_model -from spacy.ml.models.tok2vec import MultiHashEmbed, CharacterEmbed -from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder +from spacy.ml.models.tok2vec import MultiHashEmbed, MaxoutWindowEncoder from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener from spacy.vocab import Vocab from spacy.tokens import Doc from spacy.training import Example from spacy import util from spacy.lang.en import English +from spacy.util import registry from thinc.api import Config, get_current_ops from numpy.testing import assert_array_equal @@ -55,24 +55,41 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): assert doc_vec.shape == (len(doc), width) +@pytest.mark.slow +@pytest.mark.parametrize("width", [8]) @pytest.mark.parametrize( - "width,embed_arch,embed_config,encode_arch,encode_config", + "embed_arch,embed_config", # fmt: off [ - (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), - (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), - (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), - (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), + ("spacy.MultiHashEmbed.v1", {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}), + ("spacy.MultiHashEmbed.v1", {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}), + ("spacy.CharacterEmbed.v1", {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}), + ("spacy.CharacterEmbed.v1", {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}), ], # fmt: on ) -def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_config): +@pytest.mark.parametrize( + "tok2vec_arch,encode_arch,encode_config", + # fmt: off + [ + ("spacy.Tok2Vec.v1", "spacy.MaxoutWindowEncoder.v1", {"window_size": 1, "maxout_pieces": 3, "depth": 2}), + ("spacy.Tok2Vec.v2", "spacy.MaxoutWindowEncoder.v2", {"window_size": 1, "maxout_pieces": 3, "depth": 2}), + ("spacy.Tok2Vec.v1", "spacy.MishWindowEncoder.v1", {"window_size": 1, "depth": 6}), + ("spacy.Tok2Vec.v2", "spacy.MishWindowEncoder.v2", {"window_size": 1, "depth": 6}), + ], + # fmt: on +) +def test_tok2vec_configs( + width, tok2vec_arch, embed_arch, embed_config, encode_arch, encode_config +): + embed = registry.get("architectures", embed_arch) + encode = registry.get("architectures", encode_arch) + tok2vec_model = registry.get("architectures", tok2vec_arch) + embed_config["width"] = width encode_config["width"] = width docs = get_batch(3) - tok2vec = build_Tok2Vec_model( - embed_arch(**embed_config), encode_arch(**encode_config) - ) + tok2vec = tok2vec_model(embed(**embed_config), 
encode(**encode_config)) tok2vec.initialize(docs) vectors, backprop = tok2vec.begin_update(docs) assert len(vectors) == len(docs) diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md index e24c37d77..31d178b67 100644 --- a/website/docs/api/legacy.md +++ b/website/docs/api/legacy.md @@ -103,11 +103,22 @@ and residual connections. | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | | **CREATES** | The model using the architecture. ~~Model[Floats2d, Floats2d]~~ | -### spacy.TransitionBasedParser.v1 {#TransitionBasedParser_v1} +### spacy.HashEmbedCNN.v1 {#HashEmbedCNN_v1} -Identical to -[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser) -except the `use_upper` was set to `True` by default. +Identical to [`spacy.HashEmbedCNN.v2`](/api/architectures#HashEmbedCNN) except +using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are included. + +### spacy.MultiHashEmbed.v1 {#MultiHashEmbed_v1} + +Identical to [`spacy.MultiHashEmbed.v2`](/api/architectures#MultiHashEmbed) +except with [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are +included. + +### spacy.CharacterEmbed.v1 {#CharacterEmbed_v1} + +Identical to [`spacy.CharacterEmbed.v2`](/api/architectures#CharacterEmbed) +except using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are +included. ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1} @@ -147,41 +158,6 @@ network has an internal CNN Tok2Vec layer and uses attention. | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | -### spacy.HashEmbedCNN.v1 {#HashEmbedCNN_v1} - -Identical to [`spacy.HashEmbedCNN.v2`](/api/architectures#HashEmbedCNN) except -using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are included. - -### spacy.MultiHashEmbed.v1 {#MultiHashEmbed_v1} - -Identical to [`spacy.MultiHashEmbed.v2`](/api/architectures#MultiHashEmbed) -except with [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are -included. - -### spacy.CharacterEmbed.v1 {#CharacterEmbed_v1} - -Identical to [`spacy.CharacterEmbed.v2`](/api/architectures#CharacterEmbed) -except using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are -included. - -## Layers {#layers} - -These functions are available from `@spacy.registry.layers`. - -### spacy.StaticVectors.v1 {#StaticVectors_v1} - -Identical to [`spacy.StaticVectors.v2`](/api/architectures#StaticVectors) except -for the handling of tokens without vectors. - - - -`spacy.StaticVectors.v1` maps tokens without vectors to the final row in the -vectors table, which causes the model predictions to change if new vectors are -added to an existing vectors table. See more details in -[issue #7662](https://github.com/explosion/spaCy/issues/7662#issuecomment-813925655). - - - ### spacy.TextCatCNN.v1 {#TextCatCNN_v1} Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means @@ -246,8 +222,35 @@ the others, but may not be as accurate, especially if texts are short. | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. 
~~Model[List[Doc], Floats2d]~~ | +### spacy.TransitionBasedParser.v1 {#TransitionBasedParser_v1} + +Identical to +[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser) +except the `use_upper` was set to `True` by default. + +## Layers {#layers} + +These functions are available from `@spacy.registry.layers`. + +### spacy.StaticVectors.v1 {#StaticVectors_v1} + +Identical to [`spacy.StaticVectors.v2`](/api/architectures#StaticVectors) except +for the handling of tokens without vectors. + + + +`spacy.StaticVectors.v1` maps tokens without vectors to the final row in the +vectors table, which causes the model predictions to change if new vectors are +added to an existing vectors table. See more details in +[issue #7662](https://github.com/explosion/spaCy/issues/7662#issuecomment-813925655). + + + ## Loggers {#loggers} -Logging utilities for spaCy are implemented in the [`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the functions are typically available from `@spacy.registry.loggers`. +Logging utilities for spaCy are implemented in the +[`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the +functions are typically available from `@spacy.registry.loggers`. -More documentation can be found in that repo's [readme](https://github.com/explosion/spacy-loggers/blob/main/README.md) file. +More documentation can be found in that repo's +[readme](https://github.com/explosion/spacy-loggers/blob/main/README.md) file. From 290435968566d1ce0aff89cecee548b92b657bb2 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 10 May 2022 10:40:11 +0200 Subject: [PATCH 24/42] Allow assets to be optional in spacy project (#10714) * Allow assets to be optional in spacy project: draft for optional flag/download_all options. * Allow assets to be optional in spacy project: added OPTIONAL_DEFAULT reflecting default asset optionality. * Allow assets to be optional in spacy project: renamed --all to --extra. * Allow assets to be optional in spacy project: included optional flag in project config test. * Allow assets to be optional in spacy project: added documentation. * Allow assets to be optional in spacy project: fixing deprecated --all reference. Co-authored-by: Adriane Boyd * Allow assets to be optional in spacy project: fixed project_assets() docstring. * Allow assets to be optional in spacy project: adjusted wording in justification of optional assets. Co-authored-by: Sofie Van Landeghem * Allow assets to be optional in spacy project: switched to as keyword in project.yml. Updated docs. * Allow assets to be optional in spacy project: updated comment. * Allow assets to be optional in spacy project: replacing 'optional' with 'extra' in output. Co-authored-by: Sofie Van Landeghem * Allow assets to be optional in spacy project: replacing 'optional' with 'extra' in docstring.. Co-authored-by: Sofie Van Landeghem * Allow assets to be optional in spacy project: replacing 'optional' with 'extra' in test.. Co-authored-by: Sofie Van Landeghem * Allow assets to be optional in spacy project: replacing 'optional' with 'extra' in test. Co-authored-by: Sofie Van Landeghem * Allow assets to be optional in spacy project: renamed OPTIONAL_DEFAULT to EXTRA_DEFAULT. 
Co-authored-by: Adriane Boyd Co-authored-by: Sofie Van Landeghem --- spacy/cli/project/assets.py | 29 +++++++++++++++++++++++++---- spacy/tests/test_cli.py | 7 +++++++ website/docs/usage/projects.md | 30 ++++++++++++++++++------------ 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index 5e0cdfdf2..61438d1a8 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -12,6 +12,9 @@ from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config from .._util import get_checksum, download_file, git_checkout, get_git_version from .._util import SimpleFrozenDict, parse_config_overrides +# Whether assets are extra if `extra` is not set. +EXTRA_DEFAULT = False + @project_cli.command( "assets", @@ -21,7 +24,8 @@ def project_assets_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), - sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.") + sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."), + extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.") # fmt: on ): """Fetch project assets like datasets and pretrained weights. Assets are @@ -32,7 +36,12 @@ def project_assets_cli( DOCS: https://spacy.io/api/cli#project-assets """ overrides = parse_config_overrides(ctx.args) - project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout) + project_assets( + project_dir, + overrides=overrides, + sparse_checkout=sparse_checkout, + extra=extra, + ) def project_assets( @@ -40,17 +49,29 @@ def project_assets( *, overrides: Dict[str, Any] = SimpleFrozenDict(), sparse_checkout: bool = False, + extra: bool = False, ) -> None: """Fetch assets for a project using DVC if possible. project_dir (Path): Path to project directory. + sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files + needed. + extra (bool): Whether to download all assets, including those marked as 'extra'. 
""" project_path = ensure_path(project_dir) config = load_project_config(project_path, overrides=overrides) - assets = config.get("assets", {}) + assets = [ + asset + for asset in config.get("assets", []) + if extra or not asset.get("extra", EXTRA_DEFAULT) + ] if not assets: - msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0) + msg.warn( + f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)", + exits=0, + ) msg.info(f"Fetching {len(assets)} asset(s)") + for asset in assets: dest = (project_dir / asset["dest"]).resolve() checksum = asset.get("checksum") diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 0fa6f5670..3ef56d9f6 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -341,6 +341,7 @@ def test_project_config_validation_full(): "assets": [ { "dest": "x", + "extra": True, "url": "https://example.com", "checksum": "63373dd656daa1fd3043ce166a59474c", }, @@ -352,6 +353,12 @@ def test_project_config_validation_full(): "path": "y", }, }, + { + "dest": "z", + "extra": False, + "url": "https://example.com", + "checksum": "63373dd656daa1fd3043ce166a59474c", + }, ], "commands": [ { diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 57d226913..fb6f05611 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -94,9 +94,8 @@ also use any private repo you have access to with Git. Assets are data files your project needs – for example, the training and evaluation data or pretrained vectors and embeddings to initialize your model with. Each project template comes with a `project.yml` that defines the assets -to download and where to put them. The -[`spacy project assets`](/api/cli#project-assets) will fetch the project assets -for you: +to download and where to put them. The [`spacy project assets`](/api/cli#run) +will fetch the project assets for you: ```cli $ cd some_example_project @@ -108,6 +107,11 @@ even cloud storage such as GCS and S3. You can also fetch assets using git, by replacing the `url` string with a `git` block. spaCy will use Git's "sparse checkout" feature to avoid downloading the whole repository. +Sometimes your project configuration may include large assets that you don't +necessarily want to download when you run `spacy project assets`. That's why +assets can be marked as [`extra`](#data-assets-url) - by default, these assets +are not downloaded. If they should be, run `spacy project assets --extra`. + ### 3. Run a command {#run} > #### project.yml @@ -215,9 +219,9 @@ pipelines. > #### Tip: Multi-line YAML syntax for long values > -> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be -> helpful for readability with longer values such as project descriptions or -> commands that take several arguments. +> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be helpful +> for readability with longer values such as project descriptions or commands +> that take several arguments. ```yaml %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml @@ -261,8 +265,9 @@ dependencies to use certain protocols. 
> - dest: 'assets/training.spacy' > url: 'https://example.com/data.spacy' > checksum: '63373dd656daa1fd3043ce166a59474c' -> # Download from Google Cloud Storage bucket +> # Optional download from Google Cloud Storage bucket > - dest: 'assets/development.spacy' +> extra: True > url: 'gs://your-bucket/corpora' > checksum: '5113dc04e03f079525edd8df3f4f39e3' > ``` @@ -270,6 +275,7 @@ dependencies to use certain protocols. | Name | Description | | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | +| `extra` | Optional flag determining whether this asset is downloaded only if `spacy project assets` is run with `--extra`. `False` by default. | | `url` | The URL to download from, using the respective protocol. | | `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | | `description` | Optional asset description, used in [auto-generated docs](#custom-docs). | @@ -294,12 +300,12 @@ files you need and not the whole repo. > description: 'The training data (5000 examples)' > ``` -| Name | Description | -| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | +| Name | Description | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | | `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.
`branch`: The branch to download from. Defaults to `"master"`. | -| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | -| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). | +| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | +| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). | #### Working with private assets {#data-asets-private} From d524f6415f4d20e7549d8a5a81be4fdb4f71c3f0 Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Wed, 11 May 2022 10:15:32 +0200 Subject: [PATCH 25/42] Add documentation tip about overriding variables (#10780) --- website/docs/usage/projects.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index fb6f05611..566ae561b 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -226,12 +226,20 @@ pipelines. ```yaml %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml ``` +> #### Tip: Overriding variables on the CLI +> +> If you want to override one or more variables on the CLI and are not already specifying a +> project directory, you need to add `.` as a placeholder: +> +> ``` +> python -m spacy project run test . --vars.foo bar +> ``` | Section | Description | | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | | `description` | An optional project description used in [auto-generated docs](#custom-docs). | -| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | +| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | | `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. 
| | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | From b65d652881644f9a62a38d7979aee683853c818a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 12 May 2022 10:06:25 +0200 Subject: [PATCH 26/42] Override SpanGroups.setdefault to provide default SpanGroup (#10772) * Fix mistake in SpanGroup API docs * Restrict SpanGroups.setdefault to SpanGroup only * Refactor to support default span iterable --- spacy/tests/doc/test_doc_api.py | 12 +++++++++++- spacy/tokens/_dict_proxies.py | 9 +++++++++ website/docs/api/spangroup.md | 2 +- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 19b554572..dd4942989 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -11,7 +11,7 @@ from spacy.lang.en import English from spacy.lang.xx import MultiLanguage from spacy.language import Language from spacy.lexeme import Lexeme -from spacy.tokens import Doc, Span, Token +from spacy.tokens import Doc, Span, SpanGroup, Token from spacy.vocab import Vocab from .test_underscore import clean_underscore # noqa: F401 @@ -964,3 +964,13 @@ def test_doc_spans_copy(en_tokenizer): assert weakref.ref(doc1) == doc1.spans.doc_ref doc2 = doc1.copy() assert weakref.ref(doc2) == doc2.spans.doc_ref + + +def test_doc_spans_setdefault(en_tokenizer): + doc = en_tokenizer("Some text about Colombia and the Czech Republic") + doc.spans.setdefault("key1") + assert len(doc.spans["key1"]) == 0 + doc.spans.setdefault("key2", default=[doc[0:1]]) + assert len(doc.spans["key2"]) == 1 + doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]])) + assert len(doc.spans["key3"]) == 2 diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py index 8643243fa..d9506769b 100644 --- a/spacy/tokens/_dict_proxies.py +++ b/spacy/tokens/_dict_proxies.py @@ -43,6 +43,15 @@ class SpanGroups(UserDict): doc = self._ensure_doc() return SpanGroups(doc).from_bytes(self.to_bytes()) + def setdefault(self, key, default=None): + if not isinstance(default, SpanGroup): + if default is None: + spans = [] + else: + spans = default + default = self._make_span_group(key, spans) + return super().setdefault(key, default=default) + def to_bytes(self) -> bytes: # We don't need to serialize this as a dict, because the groups # know their names. diff --git a/website/docs/api/spangroup.md b/website/docs/api/spangroup.md index 1e2d18a82..8dbdefc01 100644 --- a/website/docs/api/spangroup.md +++ b/website/docs/api/spangroup.md @@ -233,7 +233,7 @@ group. > doc.spans["errors"] = [] > doc.spans["errors"].extend([doc[1:3], doc[0:1]]) > assert len(doc.spans["errors"]) == 2 -> span_group = SpanGroup([doc[1:4], doc[0:3]) +> span_group = SpanGroup(doc, spans=[doc[1:4], doc[0:3]]) > doc.spans["errors"].extend(span_group) > ``` From 6f9e2ca81ff541a0ab53b8d77100adaac8699219 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 12 May 2022 11:46:08 +0200 Subject: [PATCH 27/42] Ignore overrides for pipe names in config argument (#10779) * Pipe name override in config: added check with warning, added removal of name override from config, extended tests. 
* Pipe name override in config: added pytest UserWarning. Co-authored-by: Adriane Boyd Co-authored-by: Adriane Boyd --- spacy/errors.py | 1 + spacy/language.py | 3 +++ spacy/tests/pipeline/test_pipe_factories.py | 5 ++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index b01afcb80..bff8e7414 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -199,6 +199,7 @@ class Warnings(metaclass=ErrorsWithCodes): W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation " "for the corpora used to train the language. Please check " "`nlp.meta[\"sources\"]` for any relevant links.") + W119 = ("Overriding pipe name in `config` is not supported. Ignoring override '{name_in_config}'.") class Errors(metaclass=ErrorsWithCodes): diff --git a/spacy/language.py b/spacy/language.py index bab403f0e..faca1f258 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -774,6 +774,9 @@ class Language: name = name if name is not None else factory_name if name in self.component_names: raise ValueError(Errors.E007.format(name=name, opts=self.component_names)) + # Overriding pipe name in the config is not supported and will be ignored. + if "name" in config: + warnings.warn(Warnings.W119.format(name_in_config=config.pop("name"))) if source is not None: # We're loading the component from a model. After loading the # component, we know its real factory name diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 4128e2a48..4340a4b88 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -119,6 +119,7 @@ def test_pipe_class_component_config(): self.value1 = value1 self.value2 = value2 self.is_base = True + self.name = name def __call__(self, doc: Doc) -> Doc: return doc @@ -141,12 +142,14 @@ def test_pipe_class_component_config(): nlp.add_pipe(name) with pytest.raises(ConfigValidationError): # invalid config nlp.add_pipe(name, config={"value1": "10", "value2": "hello"}) - nlp.add_pipe(name, config={"value1": 10, "value2": "hello"}) + with pytest.warns(UserWarning): + nlp.add_pipe(name, config={"value1": 10, "value2": "hello", "name": "wrong_name"}) pipe = nlp.get_pipe(name) assert isinstance(pipe.nlp, Language) assert pipe.value1 == 10 assert pipe.value2 == "hello" assert pipe.is_base is True + assert pipe.name == name nlp_en = English() with pytest.raises(ConfigValidationError): # invalid config From cb06309ed81262b35def950b9978650c60ddab31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20D=C3=BCggelin?= Date: Thu, 12 May 2022 12:23:52 +0200 Subject: [PATCH 28/42] Fix PhraseMatcher remove overlapping terms (#10734) * Add regression test for issue 10643 * Improve overlapping terms testcase * Fix removing overlapping terms in phrase matcher (#10643) --- spacy/matcher/phrasematcher.pyx | 2 ++ spacy/tests/matcher/test_phrase_matcher.py | 30 ++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 2ff5105ad..382029872 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -118,6 +118,8 @@ cdef class PhraseMatcher: # if token is not found, break out of the loop current_node = NULL break + path_nodes.push_back(current_node) + path_keys.push_back(self._terminal_hash) # remove the tokens from trie node if there are no other # keywords with them result = map_get(current_node, self._terminal_hash) diff --git
a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index f893d81f8..3b24f3ba8 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -122,6 +122,36 @@ def test_issue6839(en_vocab): assert matches +@pytest.mark.issue(10643) +def test_issue10643(en_vocab): + """Ensure overlapping terms can be removed from PhraseMatcher""" + + # fmt: off + words = ["Only", "save", "out", "the", "binary", "data", "for", "the", "individual", "components", "."] + # fmt: on + doc = Doc(en_vocab, words=words) + terms = { + "0": Doc(en_vocab, words=["binary"]), + "1": Doc(en_vocab, words=["binary", "data"]), + } + matcher = PhraseMatcher(en_vocab) + for match_id, term in terms.items(): + matcher.add(match_id, [term]) + + matches = matcher(doc) + assert matches == [(en_vocab.strings["0"], 4, 5), (en_vocab.strings["1"], 4, 6)] + + matcher.remove("0") + assert len(matcher) == 1 + new_matches = matcher(doc) + assert new_matches == [(en_vocab.strings["1"], 4, 6)] + + matcher.remove("1") + assert len(matcher) == 0 + no_matches = matcher(doc) + assert not no_matches + + def test_matcher_phrase_matcher(en_vocab): doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) # intermediate phrase From f5952c085193c3c064ec005ca548f598e17830bb Mon Sep 17 00:00:00 2001 From: schaeran Date: Thu, 12 May 2022 18:23:00 +0200 Subject: [PATCH 29/42] update spaCy Universe: spacytextblob (code example) --- website/meta/universe.json | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index e37c918ca..e91e9ef44 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -298,6 +298,10 @@ "github": "SamEdwardes/spacytextblob", "pip": "spacytextblob", "code_example": [ + "# the following installations are required", + "# python -m textblob.download_corpora", + "# python -m spacy download en_core_web_sm", + "", "import spacy", "from spacytextblob.spacytextblob import SpacyTextBlob", "", From fd36469900da4c34f3d7b6c5400f05d8df73db8d Mon Sep 17 00:00:00 2001 From: kadarakos Date: Fri, 13 May 2022 11:41:32 +0200 Subject: [PATCH 30/42] bugfix parser labels (#10797) --- spacy/tests/training/test_rehearse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/training/test_rehearse.py b/spacy/tests/training/test_rehearse.py index 84c507702..5ac7fc217 100644 --- a/spacy/tests/training/test_rehearse.py +++ b/spacy/tests/training/test_rehearse.py @@ -181,7 +181,7 @@ def _optimize(nlp, component: str, data: List, rehearse: bool): elif component == "tagger": _add_tagger_label(pipe, data) elif component == "parser": - _add_tagger_label(pipe, data) + _add_parser_label(pipe, data) elif component == "textcat_multilabel": _add_textcat_label(pipe, data) else: From 99aeaf9bd3c3ddd3fb0440e44b14ae187b6d8d8e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 13 May 2022 19:02:08 +0200 Subject: [PATCH 31/42] Auto-format code with black (#10795) Co-authored-by: explosion-bot --- spacy/tests/pipeline/test_pipe_factories.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 4340a4b88..232b0512e 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -143,7 +143,9 @@ def test_pipe_class_component_config(): with 
pytest.raises(ConfigValidationError): # invalid config nlp.add_pipe(name, config={"value1": "10", "value2": "hello"}) with pytest.warns(UserWarning): - nlp.add_pipe(name, config={"value1": 10, "value2": "hello", "name": "wrong_name"}) + nlp.add_pipe( + name, config={"value1": 10, "value2": "hello", "name": "wrong_name"} + ) pipe = nlp.get_pipe(name) assert isinstance(pipe.nlp, Language) assert pipe.value1 == 10 From 357be2614effdfdff858eca0eb5f17e167ceb265 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 17 May 2022 10:23:16 +0200 Subject: [PATCH 32/42] Fuzz tokenizer.explain: draft for fuzzy tests. (#10771) * Fuzz tokenizer.explain: draft for fuzzy tests. * Fuzz tokenizer.explain: xignoring tokenizer.explain() tests. Removed deadline modification. Removed LANGUAGES_WITHOUT_TOKENIZERS. * Fuzz tokenizer.explain: changed tokenizer initialization to avoid failures in Azure runs. * Fuzz tokenizer.explain: type hint for tokenizer in test. Co-authored-by: Adriane Boyd Co-authored-by: Adriane Boyd --- spacy/tests/tokenizer/test_explain.py | 53 ++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index 0a10ae67d..5b4eeca16 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -1,7 +1,13 @@ -import pytest import re -from spacy.util import get_lang_class +import string + +import hypothesis +import hypothesis.strategies +import pytest + +import spacy from spacy.tokenizer import Tokenizer +from spacy.util import get_lang_class # Only include languages with no external dependencies # "is" seems to confuse importlib, so we're also excluding it for now @@ -77,3 +83,46 @@ def test_tokenizer_explain_special_matcher(en_vocab): tokens = [t.text for t in tokenizer("a/a.")] explain_tokens = [t[1] for t in tokenizer.explain("a/a.")] assert tokens == explain_tokens + + +@hypothesis.strategies.composite +def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str: + """ + Composite strategy for fuzzily generating a sentence with varying punctuation. + + draw (hypothesis.strategies.DrawFn): Protocol for drawing function allowing to fuzzily pick from hypothesis' + strategies. + max_n_words (int): Max. number of words in generated sentence. + RETURNS (str): Fuzzily generated sentence. + """ + + punctuation_and_space_regex = "|".join( + [*[re.escape(p) for p in string.punctuation], r"\s"] + ) + sentence = [ + [ + draw(hypothesis.strategies.text(min_size=1)), + draw(hypothesis.strategies.from_regex(punctuation_and_space_regex)), + ] + for _ in range( + draw(hypothesis.strategies.integers(min_value=2, max_value=max_n_words)) + ) + ] + + return " ".join([token for token_pair in sentence for token in token_pair]) + + +@pytest.mark.xfail +@pytest.mark.parametrize("lang", LANGUAGES) +@hypothesis.given(sentence=sentence_strategy()) +def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None: + """ + Tests whether output of tokenizer.explain() matches tokenizer output. Input generated by hypothesis. + lang (str): Language to test. + sentence (str): Fuzzily generated sentence to tokenize.
+ """ + + tokenizer: Tokenizer = spacy.blank(lang).tokenizer + tokens = [t.text for t in tokenizer(sentence) if not t.is_space] + debug_tokens = [t[1] for t in tokenizer.explain(sentence)] + assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}" From 46982cf694bd70a53965bc02c0f5be8203f7d526 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 20 May 2022 16:56:32 +0900 Subject: [PATCH 33/42] Add glossary entry for root (#10821) * Add glossary entry for root There was already one but it was lower case, maybe that should be removed? * remove lowercase root On reflection, that was probably just a mistake. * Add lowercase root back It's harmless to leave it there. --- spacy/glossary.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/glossary.py b/spacy/glossary.py index 25c00d3ed..d2240fbba 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -273,6 +273,7 @@ GLOSSARY = { "relcl": "relative clause modifier", "reparandum": "overridden disfluency", "root": "root", + "ROOT": "root", "vocative": "vocative", "xcomp": "open clausal complement", # Dependency labels (German) From a82ec56aae830c1ef4b57af9c0faec1aa7170789 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 20 May 2022 09:57:41 +0200 Subject: [PATCH 34/42] Remove cuda extras for non-linux arm in install widget (#10796) * Remove cuda extras for non-linux arm platforms in install widget * Extend cuda versions install widget * Update GPU install docs to clarify cuda --- website/docs/usage/index.md | 11 +++++------ website/src/widgets/quickstart-install.js | 4 +++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 54ab62467..d2aa08d73 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -129,15 +129,14 @@ machine learning library, [Thinc](https://thinc.ai). For GPU support, we've been grateful to use the work of Chainer's [CuPy](https://cupy.chainer.org) module, which provides a numpy-compatible interface for GPU arrays. -spaCy can be installed on GPU by specifying `spacy[cuda]`, `spacy[cuda90]`, -`spacy[cuda91]`, `spacy[cuda92]`, `spacy[cuda100]`, `spacy[cuda101]`, -`spacy[cuda102]`, `spacy[cuda110]`, `spacy[cuda111]` or `spacy[cuda112]`. If you -know your cuda version, using the more explicit specifier allows cupy to be -installed via wheel, saving some compilation time. The specifiers should install +spaCy can be installed for a CUDA-compatible GPU by specifying `spacy[cuda]`, +`spacy[cuda102]`, `spacy[cuda112]`, `spacy[cuda113]`, etc. If you know your +CUDA version, using the more explicit specifier allows CuPy to be installed via +wheel, saving some compilation time. The specifiers should install [`cupy`](https://cupy.chainer.org). ```bash -$ pip install -U %%SPACY_PKG_NAME[cuda92]%%SPACY_PKG_FLAGS +$ pip install -U %%SPACY_PKG_NAME[cuda113]%%SPACY_PKG_FLAGS ``` Once you have a GPU-enabled installation, the best way to activate it is to call diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index fbf043c7d..926d76ae3 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -23,6 +23,8 @@ const CUDA = { '11.2': 'cuda112', '11.3': 'cuda113', '11.4': 'cuda114', + '11.5': 'cuda115', + '11.6': 'cuda116', } const LANG_EXTRAS = ['ja'] // only for languages with models @@ -48,7 +50,7 @@ const QuickstartInstall = ({ id, title }) => { const modelExtras = train ? 
selectedModels.filter(m => LANG_EXTRAS.includes(m)) : [] const apple = os === 'mac' && platform === 'arm' const pipExtras = [ - hardware === 'gpu' && cuda, + (hardware === 'gpu' && (platform !== 'arm' || os === 'linux')) && cuda, train && 'transformers', train && 'lookups', apple && 'apple', From 4fb1809c72f0593af7fab946c7b0a4b367192d33 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 20 May 2022 15:46:30 +0200 Subject: [PATCH 35/42] Disable weekly GPU/slow tests on forks (#10831) --- .github/workflows/gputests.yml | 1 + .github/workflows/slowtests.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml index bb7f51d29..66e0707e0 100644 --- a/.github/workflows/gputests.yml +++ b/.github/workflows/gputests.yml @@ -10,6 +10,7 @@ jobs: fail-fast: false matrix: branch: [master, v4] + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - name: Trigger buildkite build diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index 1a99c751c..38ceb18c6 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -10,6 +10,7 @@ jobs: fail-fast: false matrix: branch: [master, v4] + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - name: Checkout From a3814ee7392aa12396d77e4e036ec858b1d7edb3 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Mon, 23 May 2022 09:15:51 +0200 Subject: [PATCH 36/42] oov confusion fix (#10828) --- website/docs/models/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/models/index.md b/website/docs/models/index.md index 9ee96528e..203555651 100644 --- a/website/docs/models/index.md +++ b/website/docs/models/index.md @@ -115,7 +115,7 @@ The Finnish, Korean and Swedish `md` and `lg` pipelines use running a trained pipeline on texts and working with [`Doc`](/api/doc) objects, you shouldn't notice any difference with floret vectors. With floret vectors no tokens are out-of-vocabulary, so [`Token.is_oov`](/api/token#attributes) will -return `True` for all tokens. +return `False` for all tokens. If you access vectors directly for similarity comparisons, there are a few differences because floret vectors don't include a fixed word list like the From 7ce3460b23ea7fb7cf9fc905a4e68d71057443e6 Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Mon, 23 May 2022 03:16:31 -0400 Subject: [PATCH 37/42] add floret to static vectors docs (#10833) --- website/docs/usage/embeddings-transformers.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 70fa95099..a487371de 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -530,7 +530,8 @@ models, which can **improve the accuracy** of your components. Word vectors in spaCy are "static" in the sense that they are not learned parameters of the statistical models, and spaCy itself does not feature any algorithms for learning word vector tables. You can train a word vectors table -using tools such as [Gensim](https://radimrehurek.com/gensim/), +using tools such as [floret](https://github.com/explosion/floret), +[Gensim](https://radimrehurek.com/gensim/), [FastText](https://fasttext.cc/) or [GloVe](https://nlp.stanford.edu/projects/glove/), or download existing pretrained vectors. 
The [`init vectors`](/api/cli#init-vectors) command lets you From 1d34aa2b3dd1ba0931dcb1863dfbeba6ae5b912d Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Tue, 24 May 2022 01:06:38 +0800 Subject: [PATCH 38/42] Add spacy-span-analyzer to debug data (#10668) * Rename to spans_key for consistency * Implement spans length in debug data * Implement how span bounds and spans are obtained In this commit, I implemented how span boundaries (the tokens) around a given span and spans are obtained. I've put them in the compile_gold() function so that it's accessible later on. I will do the actual computation of the span and boundary distinctiveness in the main function above. * Compute for p_spans and p_bounds * Add computation for SD and BD * Fix mypy issues * Add weighted average computation * Fix compile_gold conditional logic * Add test for frequency distribution computation * Add tests for kl-divergence computation * Fix weighted average computation * Make tables more compact by rounding them * Add more descriptive checks for spans * Modularize span computation methods In this commit, I added the _get_span_characteristics and _print_span_characteristics functions so that they can be reusable anywhere. * Remove unnecessary arguments and make fxs more compact * Update a few parameter arguments * Add tests for print_span and get_span methods * Update API to talk about span characteristics in brief * Add better reporting of spans_length * Add test for span length reporting * Update formatting of span length report Removed '' to indicate that it's not a string, then sort the n-grams by their length, not by their frequency. * Apply suggestions from code review Co-authored-by: Adriane Boyd * Show all frequency distribution when -V In this commit, I displayed the full frequency distribution of the span lengths when --verbose is passed. To make things simpler, I rewrote some of the formatter functions so that I can call them whenever. Another notable change is that instead of showing percentages as Integers, I showed them as floats (max 2-decimal places). I did this because it looks weird when it displays (0%). * Update logic on how total is computed The way the 90% thresholding is computed now is that we keep adding the percentages until we reach >= 90%. I also updated the wording and used the term "At least" to denote that >= 90% of your spans have these distributions. * Fix display when showing the threshold percentage * Apply suggestions from code review Co-authored-by: Adriane Boyd * Add better phrasing for span information * Update spacy/cli/debug_data.py Co-authored-by: Adriane Boyd * Add minor edits for whitespaces etc. 
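The span and boundary distinctiveness mentioned in the notes above comes down to a KL divergence between the token distribution observed inside spans (or at span boundaries) and the token distribution of the whole corpus. The snippet below is a minimal, self-contained sketch of that computation, mirroring the `_get_distribution` and `_get_kl_divergence` helpers added further down in this patch; the token lists are illustrative placeholders rather than real corpus data.

```python
import math
from collections import Counter
from typing import Dict, List


def get_distribution(tokens: List[str]) -> Dict[str, float]:
    # Normalized relative frequencies, analogous to _get_distribution().
    counts = Counter(token.lower() for token in tokens)
    total = sum(counts.values())
    return {token: count / total for token, count in counts.items()}


def kl_divergence(p: Dict[str, float], q: Dict[str, float]) -> float:
    # sum_x p(x) * log(p(x) / q(x)), analogous to _get_kl_divergence().
    # Assumes every token in p also occurs in q, which holds here because
    # span tokens are drawn from the corpus itself.
    return sum(p_x * math.log(p_x / q[token]) for token, p_x in p.items())


# Placeholder data: all corpus tokens vs. the tokens found inside spans.
corpus_tokens = ["the", "bank", "of", "china", "is", "a", "big", "bank", "."]
span_tokens = ["bank", "of", "china"]

p_corpus = get_distribution(corpus_tokens)
p_spans = get_distribution(span_tokens)

# Higher values mean the span tokens look less like the rest of the corpus.
print(round(kl_divergence(p_spans, p_corpus), 2))
```

The per-label scores are then combined into the weighted averages shown in the report table, with the label frequencies acting as weights.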
Co-authored-by: Adriane Boyd Co-authored-by: Adriane Boyd --- spacy/cli/debug_data.py | 293 +++++++++++++++++++++++++++++++++++++++- spacy/tests/test_cli.py | 115 ++++++++++++++++ website/docs/api/cli.md | 12 ++ 3 files changed, 415 insertions(+), 5 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index f94319d1d..0061515c6 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -6,6 +6,7 @@ import sys import srsly from wasabi import Printer, MESSAGES, msg import typer +import math from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides from ._util import import_code, debug_cli @@ -30,6 +31,12 @@ DEP_LABEL_THRESHOLD = 20 # Minimum number of expected examples to train a new pipeline BLANK_MODEL_MIN_THRESHOLD = 100 BLANK_MODEL_THRESHOLD = 2000 +# Arbitrary threshold where SpanCat performs well +SPAN_DISTINCT_THRESHOLD = 1 +# Arbitrary threshold where SpanCat performs well +BOUNDARY_DISTINCT_THRESHOLD = 1 +# Arbitrary threshold for filtering span lengths during reporting (percentage) +SPAN_LENGTH_THRESHOLD_PERCENTAGE = 90 @debug_cli.command( @@ -247,6 +254,69 @@ def debug_data( msg.warn(f"No examples for texts WITHOUT new label '{label}'") has_no_neg_warning = True + with msg.loading("Obtaining span characteristics..."): + span_characteristics = _get_span_characteristics( + train_dataset, gold_train_data, spans_key + ) + + msg.info(f"Span characteristics for spans_key '{spans_key}'") + msg.info("SD = Span Distinctiveness, BD = Boundary Distinctiveness") + _print_span_characteristics(span_characteristics) + + _span_freqs = _get_spans_length_freq_dist( + gold_train_data["spans_length"][spans_key] + ) + _filtered_span_freqs = _filter_spans_length_freq_dist( + _span_freqs, threshold=SPAN_LENGTH_THRESHOLD_PERCENTAGE + ) + + msg.info( + f"Over {SPAN_LENGTH_THRESHOLD_PERCENTAGE}% of spans have lengths of 1 -- " + f"{max(_filtered_span_freqs.keys())} " + f"(min={span_characteristics['min_length']}, max={span_characteristics['max_length']}). " + f"The most common span lengths are: {_format_freqs(_filtered_span_freqs)}. " + "If you are using the n-gram suggester, note that omitting " + "infrequent n-gram lengths can greatly improve speed and " + "memory usage." 
+ ) + + msg.text( + f"Full distribution of span lengths: {_format_freqs(_span_freqs)}", + show=verbose, + ) + + # Add report regarding span characteristics + if span_characteristics["avg_sd"] < SPAN_DISTINCT_THRESHOLD: + msg.warn("Spans may not be distinct from the rest of the corpus") + else: + msg.good("Spans are distinct from the rest of the corpus") + + p_spans = span_characteristics["p_spans"].values() + all_span_tokens: Counter = sum(p_spans, Counter()) + most_common_spans = [w for w, _ in all_span_tokens.most_common(10)] + msg.text( + "10 most common span tokens: {}".format( + _format_labels(most_common_spans) + ), + show=verbose, + ) + + # Add report regarding span boundary characteristics + if span_characteristics["avg_bd"] < BOUNDARY_DISTINCT_THRESHOLD: + msg.warn("Boundary tokens are not distinct from the rest of the corpus") + else: + msg.good("Boundary tokens are distinct from the rest of the corpus") + + p_bounds = span_characteristics["p_bounds"].values() + all_span_bound_tokens: Counter = sum(p_bounds, Counter()) + most_common_bounds = [w for w, _ in all_span_bound_tokens.most_common(10)] + msg.text( + "10 most common span boundary tokens: {}".format( + _format_labels(most_common_bounds) + ), + show=verbose, + ) + if has_low_data_warning: msg.text( f"To train a new span type, your data should include at " @@ -647,6 +717,9 @@ def _compile_gold( "words": Counter(), "roots": Counter(), "spancat": dict(), + "spans_length": dict(), + "spans_per_type": dict(), + "sb_per_type": dict(), "ws_ents": 0, "boundary_cross_ents": 0, "n_words": 0, @@ -692,14 +765,59 @@ def _compile_gold( elif label == "-": data["ner"]["-"] += 1 if "spancat" in factory_names: - for span_key in list(eg.reference.spans.keys()): - if span_key not in data["spancat"]: - data["spancat"][span_key] = Counter() - for i, span in enumerate(eg.reference.spans[span_key]): + for spans_key in list(eg.reference.spans.keys()): + # Obtain the span frequency + if spans_key not in data["spancat"]: + data["spancat"][spans_key] = Counter() + for i, span in enumerate(eg.reference.spans[spans_key]): if span.label_ is None: continue else: - data["spancat"][span_key][span.label_] += 1 + data["spancat"][spans_key][span.label_] += 1 + + # Obtain the span length + if spans_key not in data["spans_length"]: + data["spans_length"][spans_key] = dict() + for span in gold.spans[spans_key]: + if span.label_ is None: + continue + if span.label_ not in data["spans_length"][spans_key]: + data["spans_length"][spans_key][span.label_] = [] + data["spans_length"][spans_key][span.label_].append(len(span)) + + # Obtain spans per span type + if spans_key not in data["spans_per_type"]: + data["spans_per_type"][spans_key] = dict() + for span in gold.spans[spans_key]: + if span.label_ not in data["spans_per_type"][spans_key]: + data["spans_per_type"][spans_key][span.label_] = [] + data["spans_per_type"][spans_key][span.label_].append(span) + + # Obtain boundary tokens per span type + window_size = 1 + if spans_key not in data["sb_per_type"]: + data["sb_per_type"][spans_key] = dict() + for span in gold.spans[spans_key]: + if span.label_ not in data["sb_per_type"][spans_key]: + # Creating a data structure that holds the start and + # end tokens for each span type + data["sb_per_type"][spans_key][span.label_] = { + "start": [], + "end": [], + } + for offset in range(window_size): + sb_start_idx = span.start - (offset + 1) + if sb_start_idx >= 0: + data["sb_per_type"][spans_key][span.label_]["start"].append( + gold[sb_start_idx : sb_start_idx + 1] + ) + + 
sb_end_idx = span.end + (offset + 1) + if sb_end_idx <= len(gold): + data["sb_per_type"][spans_key][span.label_]["end"].append( + gold[sb_end_idx - 1 : sb_end_idx] + ) + if "textcat" in factory_names or "textcat_multilabel" in factory_names: data["cats"].update(gold.cats) if any(val not in (0, 1) for val in gold.cats.values()): @@ -770,6 +888,16 @@ def _format_labels( return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)]) +def _format_freqs(freqs: Dict[int, float], sort: bool = True) -> str: + if sort: + freqs = dict(sorted(freqs.items())) + + _freqs = [(str(k), v) for k, v in freqs.items()] + return ", ".join( + [f"{l} ({c}%)" for l, c in cast(Iterable[Tuple[str, float]], _freqs)] + ) + + def _get_examples_without_label( data: Sequence[Example], label: str, @@ -824,3 +952,158 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]: labels[pipe.key] = set() labels[pipe.key].update(pipe.labels) return labels + + +def _gmean(l: List) -> float: + """Compute geometric mean of a list""" + return math.exp(math.fsum(math.log(i) for i in l) / len(l)) + + +def _wgt_average(metric: Dict[str, float], frequencies: Counter) -> float: + total = sum(value * frequencies[span_type] for span_type, value in metric.items()) + return total / sum(frequencies.values()) + + +def _get_distribution(docs, normalize: bool = True) -> Counter: + """Get the frequency distribution given a set of Docs""" + word_counts: Counter = Counter() + for doc in docs: + for token in doc: + # Normalize the text + t = token.text.lower().replace("``", '"').replace("''", '"') + word_counts[t] += 1 + if normalize: + total = sum(word_counts.values(), 0.0) + word_counts = Counter({k: v / total for k, v in word_counts.items()}) + return word_counts + + +def _get_kl_divergence(p: Counter, q: Counter) -> float: + """Compute the Kullback-Leibler divergence from two frequency distributions""" + total = 0.0 + for word, p_word in p.items(): + total += p_word * math.log(p_word / q[word]) + return total + + +def _format_span_row(span_data: List[Dict], labels: List[str]) -> List[Any]: + """Compile into one list for easier reporting""" + d = { + label: [label] + list(round(d[label], 2) for d in span_data) for label in labels + } + return list(d.values()) + + +def _get_span_characteristics( + examples: List[Example], compiled_gold: Dict[str, Any], spans_key: str +) -> Dict[str, Any]: + """Obtain all span characteristics""" + data_labels = compiled_gold["spancat"][spans_key] + # Get lengths + span_length = { + label: _gmean(l) + for label, l in compiled_gold["spans_length"][spans_key].items() + } + min_lengths = [min(l) for l in compiled_gold["spans_length"][spans_key].values()] + max_lengths = [max(l) for l in compiled_gold["spans_length"][spans_key].values()] + + # Get relevant distributions: corpus, spans, span boundaries + p_corpus = _get_distribution([eg.reference for eg in examples], normalize=True) + p_spans = { + label: _get_distribution(spans, normalize=True) + for label, spans in compiled_gold["spans_per_type"][spans_key].items() + } + p_bounds = { + label: _get_distribution(sb["start"] + sb["end"], normalize=True) + for label, sb in compiled_gold["sb_per_type"][spans_key].items() + } + + # Compute for actual span characteristics + span_distinctiveness = { + label: _get_kl_divergence(freq_dist, p_corpus) + for label, freq_dist in p_spans.items() + } + sb_distinctiveness = { + label: _get_kl_divergence(freq_dist, p_corpus) + for label, freq_dist in p_bounds.items() + } + + return { + "sd": span_distinctiveness, + "bd": 
sb_distinctiveness, + "lengths": span_length, + "min_length": min(min_lengths), + "max_length": max(max_lengths), + "avg_sd": _wgt_average(span_distinctiveness, data_labels), + "avg_bd": _wgt_average(sb_distinctiveness, data_labels), + "avg_length": _wgt_average(span_length, data_labels), + "labels": list(data_labels.keys()), + "p_spans": p_spans, + "p_bounds": p_bounds, + } + + +def _print_span_characteristics(span_characteristics: Dict[str, Any]): + """Print all span characteristics into a table""" + headers = ("Span Type", "Length", "SD", "BD") + # Prepare table data with all span characteristics + table_data = [ + span_characteristics["lengths"], + span_characteristics["sd"], + span_characteristics["bd"], + ] + table = _format_span_row( + span_data=table_data, labels=span_characteristics["labels"] + ) + # Prepare table footer with weighted averages + footer_data = [ + span_characteristics["avg_length"], + span_characteristics["avg_sd"], + span_characteristics["avg_bd"], + ] + footer = ["Wgt. Average"] + [str(round(f, 2)) for f in footer_data] + msg.table(table, footer=footer, header=headers, divider=True) + + +def _get_spans_length_freq_dist( + length_dict: Dict, threshold=SPAN_LENGTH_THRESHOLD_PERCENTAGE +) -> Dict[int, float]: + """Get frequency distribution of spans length under a certain threshold""" + all_span_lengths = [] + for _, lengths in length_dict.items(): + all_span_lengths.extend(lengths) + + freq_dist: Counter = Counter() + for i in all_span_lengths: + if freq_dist.get(i): + freq_dist[i] += 1 + else: + freq_dist[i] = 1 + + # We will be working with percentages instead of raw counts + freq_dist_percentage = {} + for span_length, count in freq_dist.most_common(): + percentage = (count / len(all_span_lengths)) * 100.0 + percentage = round(percentage, 2) + freq_dist_percentage[span_length] = percentage + + return freq_dist_percentage + + +def _filter_spans_length_freq_dist( + freq_dist: Dict[int, float], threshold: int +) -> Dict[int, float]: + """Filter frequency distribution with respect to a threshold + + We're going to filter all the span lengths that fall + around a percentage threshold when summed. 
+ """ + total = 0.0 + filtered_freq_dist = {} + for span_length, dist in freq_dist.items(): + if total >= threshold: + break + else: + filtered_freq_dist[span_length] = dist + total += dist + return filtered_freq_dist diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 3ef56d9f6..838e00369 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,4 +1,7 @@ import os +import math +from random import sample +from typing import Counter import pytest import srsly @@ -14,6 +17,10 @@ from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat +from spacy.cli.debug_data import _get_distribution, _get_kl_divergence +from spacy.cli.debug_data import _get_span_characteristics +from spacy.cli.debug_data import _print_span_characteristics +from spacy.cli.debug_data import _get_spans_length_freq_dist from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.package import get_third_party_dependencies @@ -24,6 +31,7 @@ from spacy.lang.nl import Dutch from spacy.language import Language from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.tokens import Doc +from spacy.tokens.span import Span from spacy.training import Example, docs_to_json, offsets_to_biluo_tags from spacy.training.converters import conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs @@ -740,3 +748,110 @@ def test_debug_data_compile_gold(): eg = Example(pred, ref) data = _compile_gold([eg], ["ner"], nlp, True) assert data["boundary_cross_ents"] == 1 + + +def test_debug_data_compile_gold_for_spans(): + nlp = English() + spans_key = "sc" + + pred = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."]) + pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")] + ref = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."]) + ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")] + eg = Example(pred, ref) + + data = _compile_gold([eg], ["spancat"], nlp, True) + + assert data["spancat"][spans_key] == Counter({"ORG": 1, "GPE": 1}) + assert data["spans_length"][spans_key] == {"ORG": [3], "GPE": [1]} + assert data["spans_per_type"][spans_key] == { + "ORG": [Span(ref, 3, 6, "ORG")], + "GPE": [Span(ref, 5, 6, "GPE")], + } + assert data["sb_per_type"][spans_key] == { + "ORG": {"start": [ref[2:3]], "end": [ref[6:7]]}, + "GPE": {"start": [ref[4:5]], "end": [ref[6:7]]}, + } + + +def test_frequency_distribution_is_correct(): + nlp = English() + docs = [ + Doc(nlp.vocab, words=["Bank", "of", "China"]), + Doc(nlp.vocab, words=["China"]), + ] + + expected = Counter({"china": 0.5, "bank": 0.25, "of": 0.25}) + freq_distribution = _get_distribution(docs, normalize=True) + assert freq_distribution == expected + + +def test_kl_divergence_computation_is_correct(): + p = Counter({"a": 0.5, "b": 0.25}) + q = Counter({"a": 0.25, "b": 0.50, "c": 0.15, "d": 0.10}) + result = _get_kl_divergence(p, q) + expected = 0.1733 + assert math.isclose(result, expected, rel_tol=1e-3) + + +def test_get_span_characteristics_return_value(): + nlp = English() + spans_key = "sc" + + pred = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."]) + pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")] + 
ref = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."]) + ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")] + eg = Example(pred, ref) + + examples = [eg] + data = _compile_gold(examples, ["spancat"], nlp, True) + span_characteristics = _get_span_characteristics( + examples=examples, compiled_gold=data, spans_key=spans_key + ) + + assert {"sd", "bd", "lengths"}.issubset(span_characteristics.keys()) + assert span_characteristics["min_length"] == 1 + assert span_characteristics["max_length"] == 3 + + +def test_ensure_print_span_characteristics_wont_fail(): + """Test if interface between two methods aren't destroyed if refactored""" + nlp = English() + spans_key = "sc" + + pred = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."]) + pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")] + ref = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."]) + ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")] + eg = Example(pred, ref) + + examples = [eg] + data = _compile_gold(examples, ["spancat"], nlp, True) + span_characteristics = _get_span_characteristics( + examples=examples, compiled_gold=data, spans_key=spans_key + ) + _print_span_characteristics(span_characteristics) + + +@pytest.mark.parametrize("threshold", [70, 80, 85, 90, 95]) +def test_span_length_freq_dist_threshold_must_be_correct(threshold): + sample_span_lengths = { + "span_type_1": [1, 4, 4, 5], + "span_type_2": [5, 3, 3, 2], + "span_type_3": [3, 1, 3, 3], + } + span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold) + assert sum(span_freqs.values()) >= threshold + + +def test_span_length_freq_dist_output_must_be_correct(): + sample_span_lengths = { + "span_type_1": [1, 4, 4, 5], + "span_type_2": [5, 3, 3, 2], + "span_type_3": [3, 1, 3, 3], + } + threshold = 90 + span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold) + assert sum(span_freqs.values()) >= threshold + assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index e801ff0a6..dd396d0b3 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -466,6 +466,18 @@ takes the same arguments as `train` and reads settings off the + + +If your pipeline contains a `spancat` component, then this command will also +report span characteristics such as the average span length and the span (or +span boundary) distinctiveness. The distinctiveness measure shows how different +the tokens are with respect to the rest of the corpus using the KL-divergence of +the token distributions. To learn more, you can check out Papay et al.'s work on +[*Dissecting Span Identification Tasks with Performance Prediction* (EMNLP +2020)](https://aclanthology.org/2020.emnlp-main.396/). + + + ```cli $ python -m spacy debug data [config_path] [--code] [--ignore-warnings] [--verbose] [--no-format] [overrides] ``` From 6be09bbd07243b546ed52d5a03cdf54c8e028566 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 24 May 2022 03:42:26 +0900 Subject: [PATCH 39/42] Fix Entity Linker with tokenization mismatches (fix #9575) (#10457) * Add failing test * Partial fix for issue This kind of works. The issue with token length mismatches is gone. The problem is that when you get empty lists of encodings to compare, it fails because the sizes are not the same, even though they're both zero: (0, 3) vs (0,). Not sure why that happens... 
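As a small illustration of the shape mismatch described above (this snippet is not part of the patch, just a sketch of the failure mode), two empty arrays can still disagree on shape even though both hold zero rows:

```python
import numpy

# No matching entities: the sentence encodings keep their feature dimension,
# while the list of entity encodings collapses to a 1-dimensional empty array.
sentence_encodings = numpy.zeros((0, 3), dtype="float32")  # shape (0, 3)
entity_encodings = numpy.asarray([], dtype="float32")      # shape (0,)

print(sentence_encodings.shape == entity_encodings.shape)  # False
```

Hence the short circuit below: when no entities are kept, the loss is zero and the gradient is an all-zero array with the sentence encodings' shape.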
* Short circuit on empties * Remove spurious check The check here isn't needed now the the short circuit is fixed. * Update spacy/tests/pipeline/test_entity_linker.py Co-authored-by: Sofie Van Landeghem * Use "eg", not "example" Co-authored-by: Sofie Van Landeghem --- spacy/pipeline/entity_linker.py | 18 ++++++++----- spacy/tests/pipeline/test_entity_linker.py | 31 +++++++++++++++++++++- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 89e7576bf..12c3e382f 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -234,10 +234,11 @@ class EntityLinker(TrainablePipe): nO = self.kb.entity_vector_length doc_sample = [] vector_sample = [] - for example in islice(get_examples(), 10): - doc = example.x + for eg in islice(get_examples(), 10): + doc = eg.x if self.use_gold_ents: - doc.ents = example.y.ents + ents, _ = eg.get_aligned_ents_and_ner() + doc.ents = ents doc_sample.append(doc) vector_sample.append(self.model.ops.alloc1f(nO)) assert len(doc_sample) > 0, Errors.E923.format(name=self.name) @@ -312,7 +313,8 @@ class EntityLinker(TrainablePipe): for doc, ex in zip(docs, examples): if self.use_gold_ents: - doc.ents = ex.reference.ents + ents, _ = ex.get_aligned_ents_and_ner() + doc.ents = ents else: # only keep matching ents doc.ents = ex.get_matching_ents() @@ -345,7 +347,7 @@ class EntityLinker(TrainablePipe): for eg in examples: kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) - for ent in eg.reference.ents: + for ent in eg.get_matching_ents(): kb_id = kb_ids[ent.start] if kb_id: entity_encoding = self.kb.get_vector(kb_id) @@ -356,7 +358,11 @@ class EntityLinker(TrainablePipe): entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") selected_encodings = sentence_encodings[keep_ents] - # If the entity encodings list is empty, then + # if there are no matches, short circuit + if not keep_ents: + out = self.model.ops.alloc2f(*sentence_encodings.shape) + return 0, out + if selected_encodings.shape != entity_encodings.shape: err = Errors.E147.format( method="get_loss", msg="gold entities do not match up" diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 83d5bf0e2..ccf26f890 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -14,7 +14,7 @@ from spacy.pipeline.legacy import EntityLinker_v1 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer from spacy.tests.util import make_tempdir -from spacy.tokens import Span +from spacy.tokens import Span, Doc from spacy.training import Example from spacy.util import ensure_path from spacy.vocab import Vocab @@ -1075,3 +1075,32 @@ def test_no_gold_ents(patterns): # this will run the pipeline on the examples and shouldn't crash results = nlp.evaluate(train_examples) + +@pytest.mark.issue(9575) +def test_tokenization_mismatch(): + nlp = English() + # include a matching entity so that update isn't skipped + doc1 = Doc(nlp.vocab, words=["Kirby", "123456"], spaces=[True, False], ents=["B-CHARACTER", "B-CARDINAL"]) + doc2 = Doc(nlp.vocab, words=["Kirby", "123", "456"], spaces=[True, False, False], ents=["B-CHARACTER", "B-CARDINAL", "B-CARDINAL"]) + + eg = Example(doc1, doc2) + train_examples = [eg] + vector_length = 3 + + def create_kb(vocab): + # create placeholder KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q613241", freq=12, 
entity_vector=[6, -4, 3]) + mykb.add_alias("Kirby", ["Q613241"], [0.9]) + return mykb + + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + nlp.add_pipe("sentencizer", first=True) + results = nlp.evaluate(train_examples) From 32954c3bcb36ce751799adedf4dc09a020bb0cfb Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Wed, 25 May 2022 09:33:54 +0200 Subject: [PATCH 40/42] Fix issues for Mypy 0.950 and Pydantic 1.9.0 (#10786) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Make changes to typing * Correction * Format with black * Corrections based on review * Bumped Thinc dependency version * Bumped blis requirement * Correction for older Python versions * Update spacy/ml/models/textcat.py Co-authored-by: Daniël de Kok * Corrections based on review feedback * Readd deleted docstring line Co-authored-by: Daniël de Kok --- pyproject.toml | 4 ++-- requirements.txt | 8 ++++---- setup.cfg | 8 ++++---- spacy/errors.py | 6 +++++- spacy/lookups.py | 6 +++--- spacy/ml/models/entity_linker.py | 2 +- spacy/ml/models/parser.py | 2 +- spacy/ml/models/textcat.py | 16 +++++++++------- spacy/ml/models/tok2vec.py | 26 +++++++++++++------------- spacy/ml/staticvectors.py | 16 +++++++--------- spacy/pipeline/edit_tree_lemmatizer.py | 2 +- spacy/pipeline/entityruler.py | 6 ++---- spacy/pipeline/legacy/entity_linker.py | 7 +++---- spacy/pipeline/spancat.py | 2 +- spacy/schemas.py | 2 +- spacy/util.py | 11 ++++++----- 16 files changed, 63 insertions(+), 61 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a43b4c814..4caf46111 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,8 +5,8 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.14,<8.1.0", - "blis>=0.4.0,<0.8.0", + "thinc>=8.1.0.dev0,<8.2.0", + "blis>=0.9.0,<0.10.0", "pathy", "numpy>=1.15.0", ] diff --git a/requirements.txt b/requirements.txt index 619d35ebc..dcb594601 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,8 +3,8 @@ spacy-legacy>=3.0.9,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.14,<8.1.0 -blis>=0.4.0,<0.8.0 +thinc>=8.1.0.dev0,<8.2.0 +blis>=0.9.0,<0.10.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.1.0 @@ -16,7 +16,7 @@ pathy>=0.3.5 numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 -pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0 +pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 langcodes>=3.2.0,<4.0.0 # Official Python utilities @@ -31,7 +31,7 @@ pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<3.10.0 hypothesis>=3.27.0,<7.0.0 -mypy==0.910 +mypy>=0.910,<=0.960 types-dataclasses>=0.1.3; python_version < "3.7" types-mock>=0.1.1 types-requests diff --git a/setup.cfg b/setup.cfg index 2626de87e..5f9d51885 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,7 +38,7 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.14,<8.1.0 + thinc>=8.1.0.dev0,<8.2.0 install_requires = # Our libraries spacy-legacy>=3.0.9,<3.1.0 @@ -46,8 +46,8 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.14,<8.1.0 - blis>=0.4.0,<0.8.0 + thinc>=8.1.0.dev0,<8.2.0 + blis>=0.9.0,<0.10.0 wasabi>=0.9.1,<1.1.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 @@ -57,7 +57,7 @@ install_requires 
= tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 - pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0 + pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 # Official Python utilities setuptools diff --git a/spacy/errors.py b/spacy/errors.py index bff8e7414..67458fb52 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,4 +1,5 @@ import warnings +from .compat import Literal class ErrorsWithCodes(type): @@ -26,7 +27,10 @@ def setup_default_warnings(): filter_warning("once", error_msg="[W114]") -def filter_warning(action: str, error_msg: str): +def filter_warning( + action: Literal["default", "error", "ignore", "always", "module", "once"], + error_msg: str, +): """Customize how spaCy should handle a certain warning. error_msg (str): e.g. "W006", or a full error message diff --git a/spacy/lookups.py b/spacy/lookups.py index b2f3dc15e..d7cc44fb3 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -85,7 +85,7 @@ class Table(OrderedDict): value: The value to set. """ key = get_string_id(key) - OrderedDict.__setitem__(self, key, value) + OrderedDict.__setitem__(self, key, value) # type:ignore[assignment] self.bloom.add(key) def set(self, key: Union[str, int], value: Any) -> None: @@ -104,7 +104,7 @@ class Table(OrderedDict): RETURNS: The value. """ key = get_string_id(key) - return OrderedDict.__getitem__(self, key) + return OrderedDict.__getitem__(self, key) # type:ignore[index] def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any: """Get the value for a given key. String keys will be hashed. @@ -114,7 +114,7 @@ class Table(OrderedDict): RETURNS: The value. """ key = get_string_id(key) - return OrderedDict.get(self, key, default) + return OrderedDict.get(self, key, default) # type:ignore[arg-type] def __contains__(self, key: Union[str, int]) -> bool: # type: ignore[override] """Check whether a key is in the table. String keys will be hashed. 
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 0149bea89..fba4b485f 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -23,7 +23,7 @@ def build_nel_encoder( ((tok2vec >> list2ragged()) & build_span_maker()) >> extract_spans() >> reduce_mean() - >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) # type: ignore[arg-type] + >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) >> output_layer ) model.set_ref("output_layer", output_layer) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 63284e766..a70d84dea 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -72,7 +72,7 @@ def build_tb_parser_model( t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None tok2vec = chain( tok2vec, - cast(Model[List["Floats2d"], Floats2d], list2array()), + list2array(), Linear(hidden_width, t2v_width), ) tok2vec.set_dim("nO", hidden_width) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index c8c146f02..9c7e607fe 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,5 +1,5 @@ +from typing import Optional, List, cast from functools import partial -from typing import Optional, List from thinc.types import Floats2d from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic @@ -59,7 +59,8 @@ def build_simple_cnn_text_classifier( resizable_layer=resizable_layer, ) model.set_ref("tok2vec", tok2vec) - model.set_dim("nO", nO) # type: ignore # TODO: remove type ignore once Thinc has been updated + if nO is not None: + model.set_dim("nO", cast(int, nO)) model.attrs["multi_label"] = not exclusive_classes return model @@ -85,7 +86,7 @@ def build_bow_text_classifier( if not no_output_layer: fill_defaults["b"] = NEG_VALUE output_layer = softmax_activation() if exclusive_classes else Logistic() - resizable_layer = resizable( # type: ignore[var-annotated] + resizable_layer: Model[Floats2d, Floats2d] = resizable( sparse_linear, resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults), ) @@ -93,7 +94,8 @@ def build_bow_text_classifier( model = with_cpu(model, model.ops) if output_layer: model = model >> with_cpu(output_layer, output_layer.ops) - model.set_dim("nO", nO) # type: ignore[arg-type] + if nO is not None: + model.set_dim("nO", cast(int, nO)) model.set_ref("output_layer", sparse_linear) model.attrs["multi_label"] = not exclusive_classes model.attrs["resize_output"] = partial( @@ -129,8 +131,8 @@ def build_text_classifier_v2( output_layer = Linear(nO=nO, nI=nO_double) >> Logistic() model = (linear_model | cnn_model) >> output_layer model.set_ref("tok2vec", tok2vec) - if model.has_dim("nO") is not False: - model.set_dim("nO", nO) # type: ignore[arg-type] + if model.has_dim("nO") is not False and nO is not None: + model.set_dim("nO", cast(int, nO)) model.set_ref("output_layer", linear_model.get_ref("output_layer")) model.set_ref("attention_layer", attention_layer) model.set_ref("maxout_layer", maxout_layer) @@ -164,7 +166,7 @@ def build_text_classifier_lowdata( >> list2ragged() >> ParametricAttention(width) >> reduce_sum() - >> residual(Relu(width, width)) ** 2 # type: ignore[arg-type] + >> residual(Relu(width, width)) ** 2 >> Linear(nO, width) ) if dropout: diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index ecdf6be27..30c7360ff 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -1,5 +1,5 @@ from typing import Optional, List, Union, 
cast -from thinc.types import Floats2d, Ints2d, Ragged +from thinc.types import Floats2d, Ints2d, Ragged, Ints1d from thinc.api import chain, clone, concatenate, with_array, with_padded from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM @@ -159,7 +159,7 @@ def MultiHashEmbed( embeddings = [make_hash_embed(i) for i in range(len(attrs))] concat_size = width * (len(embeddings) + include_static_vectors) max_out: Model[Ragged, Ragged] = with_array( - Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True) # type: ignore + Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True) ) if include_static_vectors: feature_extractor: Model[List[Doc], Ragged] = chain( @@ -173,7 +173,7 @@ def MultiHashEmbed( StaticVectors(width, dropout=0.0), ), max_out, - cast(Model[Ragged, List[Floats2d]], ragged2list()), + ragged2list(), ) else: model = chain( @@ -181,7 +181,7 @@ def MultiHashEmbed( cast(Model[List[Ints2d], Ragged], list2ragged()), with_array(concatenate(*embeddings)), max_out, - cast(Model[Ragged, List[Floats2d]], ragged2list()), + ragged2list(), ) return model @@ -232,12 +232,12 @@ def CharacterEmbed( feature_extractor: Model[List[Doc], Ragged] = chain( FeatureExtractor([feature]), cast(Model[List[Ints2d], Ragged], list2ragged()), - with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), # type: ignore + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), # type: ignore[misc] ) max_out: Model[Ragged, Ragged] if include_static_vectors: max_out = with_array( - Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0) # type: ignore + Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0) ) model = chain( concatenate( @@ -246,11 +246,11 @@ def CharacterEmbed( StaticVectors(width, dropout=0.0), ), max_out, - cast(Model[Ragged, List[Floats2d]], ragged2list()), + ragged2list(), ) else: max_out = with_array( - Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0) # type: ignore + Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0) ) model = chain( concatenate( @@ -258,7 +258,7 @@ def CharacterEmbed( feature_extractor, ), max_out, - cast(Model[Ragged, List[Floats2d]], ragged2list()), + ragged2list(), ) return model @@ -289,10 +289,10 @@ def MaxoutWindowEncoder( normalize=True, ), ) - model = clone(residual(cnn), depth) # type: ignore[arg-type] + model = clone(residual(cnn), depth) model.set_dim("nO", width) receptive_field = window_size * depth - return with_array(model, pad=receptive_field) # type: ignore[arg-type] + return with_array(model, pad=receptive_field) @registry.architectures("spacy.MishWindowEncoder.v2") @@ -313,9 +313,9 @@ def MishWindowEncoder( expand_window(window_size=window_size), Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True), ) - model = clone(residual(cnn), depth) # type: ignore[arg-type] + model = clone(residual(cnn), depth) model.set_dim("nO", width) - return with_array(model) # type: ignore[arg-type] + return with_array(model) @registry.architectures("spacy.TorchBiLSTMEncoder.v1") diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 8d9b1af9b..04cfe912d 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -40,17 +40,15 @@ def forward( if not token_count: return _handle_empty(model.ops, model.get_dim("nO")) key_attr: int = model.attrs["key_attr"] - keys: Ints1d = model.ops.flatten( - cast(Sequence, [doc.to_array(key_attr) for doc in docs]) - ) 
+ keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs]) vocab: Vocab = docs[0].vocab W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) if vocab.vectors.mode == Mode.default: - V = cast(Floats2d, model.ops.asarray(vocab.vectors.data)) + V = model.ops.asarray(vocab.vectors.data) rows = vocab.vectors.find(keys=keys) V = model.ops.as_contig(V[rows]) elif vocab.vectors.mode == Mode.floret: - V = cast(Floats2d, vocab.vectors.get_batch(keys)) + V = vocab.vectors.get_batch(keys) V = model.ops.as_contig(V) else: raise RuntimeError(Errors.E896) @@ -62,9 +60,7 @@ def forward( # Convert negative indices to 0-vectors # TODO: more options for UNK tokens vectors_data[rows < 0] = 0 - output = Ragged( - vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore - ) + output = Ragged(vectors_data, model.ops.asarray1i([len(doc) for doc in docs])) mask = None if is_train: mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate")) @@ -77,7 +73,9 @@ def forward( model.inc_grad( "W", model.ops.gemm( - cast(Floats2d, d_output.data), model.ops.as_contig(V), trans1=True + cast(Floats2d, d_output.data), + cast(Floats2d, model.ops.as_contig(V)), + trans1=True, ), ) return [] diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index 54a7030dc..b7d615f6d 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -138,7 +138,7 @@ class EditTreeLemmatizer(TrainablePipe): truths.append(eg_truths) - d_scores, loss = loss_func(scores, truths) # type: ignore + d_scores, loss = loss_func(scores, truths) if self.model.ops.xp.isnan(loss): raise ValueError(Errors.E910.format(name=self.name)) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 614d71f41..4307af793 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -159,10 +159,8 @@ class EntityRuler(Pipe): self._require_patterns() with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="\\[W036") - matches = cast( - List[Tuple[int, int, int]], - list(self.matcher(doc)) + list(self.phrase_matcher(doc)), - ) + matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) + final_matches = set( [(m_id, start, end) for m_id, start, end in matches if start != end] ) diff --git a/spacy/pipeline/legacy/entity_linker.py b/spacy/pipeline/legacy/entity_linker.py index 6440c18e5..d723bdbe5 100644 --- a/spacy/pipeline/legacy/entity_linker.py +++ b/spacy/pipeline/legacy/entity_linker.py @@ -213,15 +213,14 @@ class EntityLinker_v1(TrainablePipe): if kb_id: entity_encoding = self.kb.get_vector(kb_id) entity_encodings.append(entity_encoding) - entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") + entity_encodings = self.model.ops.asarray2f(entity_encodings) if sentence_encodings.shape != entity_encodings.shape: err = Errors.E147.format( method="get_loss", msg="gold entities do not match up" ) raise RuntimeError(err) - # TODO: fix typing issue here - gradients = self.distance.get_grad(sentence_encodings, entity_encodings) # type: ignore - loss = self.distance.get_loss(sentence_encodings, entity_encodings) # type: ignore + gradients = self.distance.get_grad(sentence_encodings, entity_encodings) + loss = self.distance.get_loss(sentence_encodings, entity_encodings) loss = loss / len(entity_encodings) return float(loss), gradients diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 0a6138fbc..1b7a9eecb 100644 
--- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -75,7 +75,7 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester: if spans: assert spans[-1].ndim == 2, spans[-1].shape lengths.append(length) - lengths_array = cast(Ints1d, ops.asarray(lengths, dtype="i")) + lengths_array = ops.asarray1i(lengths) if len(spans) > 0: output = Ragged(ops.xp.vstack(spans), lengths_array) else: diff --git a/spacy/schemas.py b/spacy/schemas.py index 1dfd8ee85..7d87658f2 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -104,7 +104,7 @@ def get_arg_model( sig_args[param.name] = (annotation, default) is_strict = strict and not has_variable sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra # type: ignore[assignment] - return create_model(name, **sig_args) # type: ignore[arg-type, return-value] + return create_model(name, **sig_args) # type: ignore[call-overload, arg-type, return-value] def validate_init_settings( diff --git a/spacy/util.py b/spacy/util.py index 66e257dd8..5ca6e4032 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,4 +1,4 @@ -from typing import List, Mapping, NoReturn, Union, Dict, Any, Set +from typing import List, Mapping, NoReturn, Union, Dict, Any, Set, cast from typing import Optional, Iterable, Callable, Tuple, Type from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING from types import ModuleType @@ -294,7 +294,7 @@ def find_matching_language(lang: str) -> Optional[str]: # Find out which language modules we have possible_languages = [] - for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore + for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore[attr-defined] code = modinfo.name if code == "xx": # Temporarily make 'xx' into a valid language code @@ -391,7 +391,8 @@ def get_module_path(module: ModuleType) -> Path: """ if not hasattr(module, "__module__"): raise ValueError(Errors.E169.format(module=repr(module))) - return Path(sys.modules[module.__module__].__file__).parent + file_path = Path(cast(os.PathLike, sys.modules[module.__module__].__file__)) + return file_path.parent def load_model( @@ -878,7 +879,7 @@ def get_package_path(name: str) -> Path: # Here we're importing the module just to find it. This is worryingly # indirect, but it's otherwise very difficult to find the package. pkg = importlib.import_module(name) - return Path(pkg.__file__).parent + return Path(cast(Union[str, os.PathLike], pkg.__file__)).parent def replace_model_node(model: Model, target: Model, replacement: Model) -> None: @@ -1675,7 +1676,7 @@ def packages_distributions() -> Dict[str, List[str]]: it's not available in the builtin importlib.metadata. 
""" pkg_to_dist = defaultdict(list) - for dist in importlib_metadata.distributions(): # type: ignore[attr-defined] + for dist in importlib_metadata.distributions(): for pkg in (dist.read_text("top_level.txt") or "").split(): pkg_to_dist[pkg].append(dist.metadata["Name"]) return dict(pkg_to_dist) From 83ed1f391bc6551477b6c97d7ffec61ac2cf80b3 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 25 May 2022 09:48:39 +0200 Subject: [PATCH 41/42] Remove NBSP's across tables in the docs (#10842) --- website/docs/api/cli.md | 18 ++++----- website/docs/api/corpus.md | 32 ++++++++-------- website/docs/api/language.md | 4 +- website/docs/api/matcher.md | 40 ++++++++++---------- website/docs/api/top-level.md | 46 +++++++++++------------ website/docs/usage/linguistic-features.md | 4 +- website/docs/usage/rule-based-matching.md | 34 ++++++++--------- website/docs/usage/v3-1.md | 14 +++---- website/docs/usage/v3.md | 8 ++-- 9 files changed, 98 insertions(+), 102 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index dd396d0b3..cbd1f794a 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -1335,7 +1335,7 @@ $ python -m spacy project run [subcommand] [project_dir] [--force] [--dry] | `subcommand` | Name of the command or workflow to run. ~~str (positional)~~ | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | | `--force`, `-F` | Force re-running steps, even if nothing changed. ~~bool (flag)~~ | -| `--dry`, `-D` |  Perform a dry run and don't execute scripts. ~~bool (flag)~~ | +| `--dry`, `-D` | Perform a dry run and don't execute scripts. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **EXECUTES** | The command defined in the `project.yml`. | @@ -1453,12 +1453,12 @@ For more examples, see the templates in our -| Name | Description | -| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | -| `--output`, `-o` | Path to output file or `-` for stdout (default). If a file is specified and it already exists and contains auto-generated docs, only the auto-generated docs section is replaced. ~~Path (positional)~~ | -|  `--no-emoji`, `-NE` | Don't use emoji in the titles. ~~bool (flag)~~ | -| **CREATES** | The Markdown-formatted project documentation. | +| Name | Description | +| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `--output`, `-o` | Path to output file or `-` for stdout (default). If a file is specified and it already exists and contains auto-generated docs, only the auto-generated docs section is replaced. ~~Path (positional)~~ | +| `--no-emoji`, `-NE` | Don't use emoji in the titles. ~~bool (flag)~~ | +| **CREATES** | The Markdown-formatted project documentation. | ### project dvc {#project-dvc tag="command"} @@ -1497,7 +1497,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] | `project_dir` | Path to project directory. 
Defaults to current working directory. ~~Path (positional)~~ | | `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ | | `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ | -| `--verbose`, `-V` |  Print more output generated by DVC. ~~bool (flag)~~ | +| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. | @@ -1588,5 +1588,5 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] | `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ | | `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ | | `--local-repo`, `-l` | Local path to the model repository (will be created if it doesn't exist). Defaults to `hub` in the current working directory. ~~Path (option)~~ | -| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~  | +| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ | | **UPLOADS** | The pipeline to the hub. | diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 35afc8fea..88c4befd7 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -37,13 +37,13 @@ streaming. > augmenter = null > ``` -| Name | Description | -| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ | -|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ | -| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | -| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | -| `augmenter` | Apply some simply data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ | +| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ | +| `max_length` | Maximum document length. 
Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | +| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | +| `augmenter` | Apply some simply data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/training/corpus.py @@ -71,15 +71,15 @@ train/test skew. > corpus = Corpus("./data", limit=10) > ``` -| Name | Description | -| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | -| `path` | The directory or filename to read from. ~~Union[str, Path]~~ | -| _keyword-only_ | | -|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ | -| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | -| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | -| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ | -| `shuffle` | Whether to shuffle the examples. Defaults to `False`. ~~bool~~ | +| Name | Description | +| -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | The directory or filename to read from. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ | +| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | +| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | +| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ | +| `shuffle` | Whether to shuffle the examples. Defaults to `False`. ~~bool~~ | ## Corpus.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 8d7686243..9a413efaf 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -1123,7 +1123,7 @@ instance and factory instance. | `factory` | The name of the registered component factory. ~~str~~ | | `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | | `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~  | -| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~  | +| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. 
Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | | `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ | | `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Based on the `default_score_weights` and used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 273c202ca..6c8cae211 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -30,26 +30,26 @@ pattern keys correspond to a number of [`Token` attributes](/api/token#attributes). The supported attributes for rule-based matching are: -| Attribute |  Description | -| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | -| `ORTH` | The exact verbatim text of a token. ~~str~~ | -| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ | -| `NORM` | The normalized form of the token text. ~~str~~ | -| `LOWER` | The lowercase form of the token text. ~~str~~ | -|  `LENGTH` | The length of the token text. ~~int~~ | -|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ | -|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ | -|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | -|  `IS_SENT_START` | Token is start of sentence. ~~bool~~ | -|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | -| `SPACY` | Token has a trailing space. ~~bool~~ | -|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | -| `ENT_TYPE` | The token's entity label. ~~str~~ | -| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ | -| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | -| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ | -| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | -| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ | +| Attribute | Description | +| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | +| `ORTH` | The exact verbatim text of a token. ~~str~~ | +| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ | +| `NORM` | The normalized form of the token text. ~~str~~ | +| `LOWER` | The lowercase form of the token text. ~~str~~ | +| `LENGTH` | The length of the token text. ~~int~~ | +| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ | +| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. 
~~bool~~ | +| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | +| `IS_SENT_START` | Token is start of sentence. ~~bool~~ | +| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | +| `SPACY` | Token has a trailing space. ~~bool~~ | +| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | +| `ENT_TYPE` | The token's entity label. ~~str~~ | +| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ | +| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | +| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ | +| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | +| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ | Operators and quantifiers define **how often** a token pattern should be matched: diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index f2fd1415f..904a91ea9 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -320,7 +320,6 @@ If a setting is not present in the options, the default value will be used. | `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | | `kb_url_template` 3.2.1 | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in. ~~Optional[str]~~ | - #### Span Visualizer options {#displacy_options-span} > #### Example @@ -330,21 +329,19 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="span", options=options) > ``` -| Name | Description | -|-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| -| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ | +| Name | Description | +| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ | | `templates` | Dictionary containing the keys `"span"`, `"slice"`, and `"start"`. These dictate how the overall span, a span slice, and the starting token will be rendered. ~~Optional[Dict[str, str]~~ | -| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in ~~Optional[str]~~ | -| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | +| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in ~~Optional[str]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | - -By default, displaCy comes with colors for all entity types used by [spaCy's -trained pipelines](/models) for both entity and span visualizer. 
If you're -using custom entity types, you can use the `colors` setting to add your own -colors for them. Your application or pipeline package can also expose a -[`spacy_displacy_colors` entry -point](/usage/saving-loading#entry-points-displacy) to add custom labels and -their colors automatically. +By default, displaCy comes with colors for all entity types used by +[spaCy's trained pipelines](/models) for both entity and span visualizer. If +you're using custom entity types, you can use the `colors` setting to add your +own colors for them. Your application or pipeline package can also expose a +[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) +to add custom labels and their colors automatically. By default, displaCy links to `#` for entities without a `kb_id` set on their span. If you wish to link an entity to their URL then consider using the @@ -354,7 +351,6 @@ span. If you wish to link an entity to their URL then consider using the should redirect you to their Wikidata page, in this case `https://www.wikidata.org/wiki/Q95`. - ## registry {#registry source="spacy/util.py" new="3"} spaCy's function registry extends @@ -443,8 +439,8 @@ and the accuracy scores on the development set. The built-in, default logger is the ConsoleLogger, which prints results to the console in tabular format. The [spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as -a dependency of spaCy, enables other loggers, such as one that -sends results to a [Weights & Biases](https://www.wandb.com/) dashboard. +a dependency of spaCy, enables other loggers, such as one that sends results to +a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers, you can [implement your own](/usage/training#custom-logging). @@ -583,14 +579,14 @@ the [`Corpus`](/api/corpus) class. > limit = 0 > ``` -| Name | Description | -| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ | -|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ | -| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | -| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | -| `augmenter` | Apply some simply data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ | -| **CREATES** | The corpus reader. 
~~Corpus~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ | +| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ | +| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | +| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | +| `augmenter` | Apply some simply data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ | +| **CREATES** | The corpus reader. ~~Corpus~~ | #### spacy.JsonlCorpus.v1 {#jsonlcorpus tag="registered function"} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index b3b896a54..c547ec0bc 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -48,7 +48,7 @@ but do not change its part-of-speech. We say that a **lemma** (root form) is **inflected** (modified/combined) with one or more **morphological features** to create a surface form. Here are some examples: -| Context | Surface | Lemma | POS |  Morphological Features | +| Context | Surface | Lemma | POS | Morphological Features | | ---------------------------------------- | ------- | ----- | ------ | ---------------------------------------- | | I was reading the paper | reading | read | `VERB` | `VerbForm=Ger` | | I don't watch the news, I read the paper | read | read | `VERB` | `VerbForm=Fin`, `Mood=Ind`, `Tense=Pres` | @@ -430,7 +430,7 @@ for token in doc: print(token.text, token.pos_, token.dep_, token.head.text) ``` -| Text |  POS | Dep | Head text | +| Text | POS | Dep | Head text | | ----------------------------------- | ------ | ------- | --------- | | Credit and mortgage account holders | `NOUN` | `nsubj` | submit | | must | `VERB` | `aux` | submit | diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index be9a56dc8..bf654c14f 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -158,23 +158,23 @@ The available token pattern keys correspond to a number of [`Token` attributes](/api/token#attributes). The supported attributes for rule-based matching are: -| Attribute |  Description | -| ----------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `ORTH` | The exact verbatim text of a token. ~~str~~ | -| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ | -| `NORM` | The normalized form of the token text. 
~~str~~ | -| `LOWER` | The lowercase form of the token text. ~~str~~ | -|  `LENGTH` | The length of the token text. ~~int~~ | -|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ | -|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ | -|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | -|  `IS_SENT_START` | Token is start of sentence. ~~bool~~ | -|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | -| `SPACY` | Token has a trailing space. ~~bool~~ | -|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. Note that the values of these attributes are case-sensitive. For a list of available part-of-speech tags and dependency labels, see the [Annotation Specifications](/api/annotation). ~~str~~ | -| `ENT_TYPE` | The token's entity label. ~~str~~ | -| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | -| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ | +| Attribute | Description | +| ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ORTH` | The exact verbatim text of a token. ~~str~~ | +| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ | +| `NORM` | The normalized form of the token text. ~~str~~ | +| `LOWER` | The lowercase form of the token text. ~~str~~ | +| `LENGTH` | The length of the token text. ~~int~~ | +| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ | +| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ | +| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | +| `IS_SENT_START` | Token is start of sentence. ~~bool~~ | +| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | +| `SPACY` | Token has a trailing space. ~~bool~~ | +| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. Note that the values of these attributes are case-sensitive. For a list of available part-of-speech tags and dependency labels, see the [Annotation Specifications](/api/annotation). ~~str~~ | +| `ENT_TYPE` | The token's entity label. ~~str~~ | +| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | +| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ | diff --git a/website/docs/usage/v3-1.md b/website/docs/usage/v3-1.md index 1bac8fd81..2725cacb9 100644 --- a/website/docs/usage/v3-1.md +++ b/website/docs/usage/v3-1.md @@ -132,13 +132,13 @@ your own. > contributions for Catalan and to Kenneth Enevoldsen for Danish. For additional > Danish pipelines, check out [DaCy](https://github.com/KennethEnevoldsen/DaCy). 
-| Package | Language | UPOS | Parser LAS |  NER F | -| ------------------------------------------------- | -------- | ---: | ---------: | -----: | -| [`ca_core_news_sm`](/models/ca#ca_core_news_sm) | Catalan | 98.2 | 87.4 | 79.8 | -| [`ca_core_news_md`](/models/ca#ca_core_news_md) | Catalan | 98.3 | 88.2 | 84.0 | -| [`ca_core_news_lg`](/models/ca#ca_core_news_lg) | Catalan | 98.5 | 88.4 | 84.2 | -| [`ca_core_news_trf`](/models/ca#ca_core_news_trf) | Catalan | 98.9 | 93.0 | 91.2 | -| [`da_core_news_trf`](/models/da#da_core_news_trf) | Danish | 98.0 | 85.0 | 82.9 | +| Package | Language | UPOS | Parser LAS | NER F | +| ------------------------------------------------- | -------- | ---: | ---------: | ----: | +| [`ca_core_news_sm`](/models/ca#ca_core_news_sm) | Catalan | 98.2 | 87.4 | 79.8 | +| [`ca_core_news_md`](/models/ca#ca_core_news_md) | Catalan | 98.3 | 88.2 | 84.0 | +| [`ca_core_news_lg`](/models/ca#ca_core_news_lg) | Catalan | 98.5 | 88.4 | 84.2 | +| [`ca_core_news_trf`](/models/ca#ca_core_news_trf) | Catalan | 98.9 | 93.0 | 91.2 | +| [`da_core_news_trf`](/models/da#da_core_news_trf) | Danish | 98.0 | 85.0 | 82.9 | ### Resizable text classification architectures {#resizable-textcat} diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 980f06172..971779ed3 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -116,7 +116,7 @@ import Benchmarks from 'usage/\_benchmarks-models.md' > corpus that had both syntactic and entity annotations, so the transformer > models for those languages do not include NER. -| Package | Language | Transformer | Tagger | Parser |  NER | +| Package | Language | Transformer | Tagger | Parser | NER | | ------------------------------------------------ | -------- | --------------------------------------------------------------------------------------------- | -----: | -----: | ---: | | [`en_core_web_trf`](/models/en#en_core_web_trf) | English | [`roberta-base`](https://huggingface.co/roberta-base) | 97.8 | 95.2 | 89.9 | | [`de_dep_news_trf`](/models/de#de_dep_news_trf) | German | [`bert-base-german-cased`](https://huggingface.co/bert-base-german-cased) | 99.0 | 95.8 | - | @@ -856,9 +856,9 @@ attribute ruler before training using the `[initialize]` block of your config. ### Using Lexeme Tables -To use tables like `lexeme_prob` when training a model from scratch, you need -to add an entry to the `initialize` block in your config. Here's what that -looks like for the existing trained pipelines: +To use tables like `lexeme_prob` when training a model from scratch, you need to +add an entry to the `initialize` block in your config. 
Here's what that looks +like for the existing trained pipelines: ```ini [initialize.lookups] From f6a4b80c0b76f93f1714b8f2e6f1cb87d16e49f9 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Wed, 25 May 2022 11:12:29 +0200 Subject: [PATCH 42/42] Better errors for has_annotation and Matcher (#10830) * Show input argument instead of None * catch invalid attr early * moved error message from code to errors.py * Update spacy/errors.py Co-authored-by: Adriane Boyd * Update spacy/errors.py * update E153 and E154 Co-authored-by: Adriane Boyd --- spacy/errors.py | 5 +++-- spacy/matcher/matcher.pyx | 3 ++- spacy/tokens/doc.pyx | 5 +++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 67458fb52..c82ffe882 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -449,10 +449,10 @@ class Errors(metaclass=ErrorsWithCodes): "same, but found '{nlp}' and '{vocab}' respectively.") E152 = ("The attribute {attr} is not supported for token patterns. " "Please use the option `validate=True` with the Matcher, PhraseMatcher, " - "or EntityRuler for more details.") + "EntityRuler or AttributeRuler for more details.") E153 = ("The value type {vtype} is not supported for token patterns. " "Please use the option validate=True with Matcher, PhraseMatcher, " - "or EntityRuler for more details.") + "EntityRuler or AttributeRuler for more details.") E154 = ("One of the attributes or values is not supported for token " "patterns. Please use the option `validate=True` with the Matcher, " "PhraseMatcher, or EntityRuler for more details.") @@ -918,6 +918,7 @@ class Errors(metaclass=ErrorsWithCodes): E1034 = ("Node index {i} out of bounds ({length})") E1035 = ("Token index {i} out of bounds ({length})") E1036 = ("Cannot index into NoneNode") + E1037 = ("Invalid attribute value '{attr}'.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index e43583e30..981c5cdd2 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -786,6 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): def _get_attr_values(spec, string_store): attr_values = [] for attr, value in spec.items(): + input_attr = attr if isinstance(attr, str): attr = attr.upper() if attr == '_': @@ -814,7 +815,7 @@ def _get_attr_values(spec, string_store): attr_values.append((attr, value)) else: # should be caught in validation - raise ValueError(Errors.E152.format(attr=attr)) + raise ValueError(Errors.E152.format(attr=input_attr)) return attr_values diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index c36e3a02f..d25247b13 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -414,6 +414,7 @@ cdef class Doc: """ # empty docs are always annotated + input_attr = attr if self.length == 0: return True cdef int i @@ -423,6 +424,10 @@ cdef class Doc: elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]: attr = SENT_START attr = intify_attr(attr) + if attr is None: + raise ValueError( + Errors.E1037.format(attr=input_attr) + ) # adjust attributes if attr == HEAD: # HEAD does not have an unset state, so rely on DEP
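
As a rough, hedged sketch of the behavior PATCH 42/42 is aiming for — this snippet is illustrative only and not part of the patch series; the attribute key "ORT" is a hypothetical typo chosen for the example, and the exact wording follows the E152/E1037 strings in the diff above:

```python
# Illustrative only -- assumes spaCy with the changes from PATCH 42/42 applied.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
doc = nlp("Apples and oranges")

# Doc.has_annotation() now validates its argument up front: an unknown
# attribute name raises E1037 instead of being silently converted to None.
try:
    doc.has_annotation("ORT")  # "ORT" is a hypothetical typo for "ORTH"
except ValueError as err:
    print(err)  # expected to read roughly: [E1037] Invalid attribute value 'ORT'.

# Matcher patterns with an unsupported attribute now report the attribute as
# the caller wrote it (the `input_attr` kept in _get_attr_values) rather than
# the internal value None, and this is raised when the pattern is added.
matcher = Matcher(nlp.vocab)
try:
    matcher.add("BAD_PATTERN", [[{"ORT": "Apples"}]])
except ValueError as err:
    print(err)  # expected to be an E152 error naming 'ORT'
```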