From dc5be7d2f35ca34fccbfb9e33ecf9dd7160899c5 Mon Sep 17 00:00:00 2001 From: mollerhoj Date: Mon, 3 Jul 2017 15:40:58 +0200 Subject: [PATCH 01/99] Cleanup list of Danish stopwords --- spacy/lang/da/stop_words.py | 43 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/spacy/lang/da/stop_words.py b/spacy/lang/da/stop_words.py index ac2195f10..ba448f8f3 100644 --- a/spacy/lang/da/stop_words.py +++ b/spacy/lang/da/stop_words.py @@ -1,47 +1,46 @@ # encoding: utf8 from __future__ import unicode_literals - -# Source: https://github.com/stopwords-iso/stopwords-da +# Source: Handpicked by Jens Dahl Møllerhøj. STOP_WORDS = set(""" -ad af aldrig alle alt anden andet andre at +af aldrig alene alle allerede alligevel alt altid anden andet andre at -bare begge blev blive bliver +bag begge blandt blev blive bliver burde bør -da de dem den denne der deres det dette dig din dine disse dit dog du +da de dem den denne dens der derefter deres derfor derfra deri dermed derpå derved det dette dig din dine disse dog du -efter ej eller en end ene eneste enhver er et +efter egen eller ellers en end endnu ene eneste enhver ens enten er et -far fem fik fire flere fleste for fordi forrige fra få får før +flere flest fleste for foran fordi forrige fra få før først -god godt +gennem gjorde gjort god gør gøre gørende -ham han hans har havde have hej helt hende hendes her hos hun hvad hvem hver -hvilken hvis hvor hvordan hvorfor hvornår +ham han hans har havde have hel heller hen hende hendes henover her herefter heri hermed herpå hun hvad hvem hver hvilke hvilken hvilkes hvis hvor hvordan hvorefter hvorfor hvorfra hvorhen hvori hvorimod hvornår hvorved -i ikke ind ingen intet +i igen igennem ikke imellem imens imod ind indtil ingen intet -ja jeg jer jeres jo +jeg jer jeres jo -kan kom komme kommer kun kunne +kan kom kommer kun kunne -lad lav lidt lige lille +lad langs lav lave lavet lidt lige ligesom lille længere -man mand mange med meget men mens mere mig min mine mit mod må +man mange med meget mellem men mens mere mest mig min mindre mindst mine mit må måske -ned nej ni nogen noget nogle nu ny nyt når nær næste næsten +ned nemlig nogen nogensinde noget nogle nok nu ny nyt nær næste næsten -og også okay om op os otte over +og også om omkring op os over overalt på -se seks selv ser ses sig sige sin sine sit skal skulle som stor store syv så -sådan +samme sammen selv selvom senere ses siden sig sige skal skulle som stadig synes syntes så sådan således -tag tage thi ti til to tre +temmelig tidligere til tilbage tit -ud under +ud uden udover under undtagen -var ved vi vil ville vor vores være været +var ved vi via vil ville vore vores vær være været + +øvrigt """.split()) From 23025d3b05572a840ec91301092f8bee68cb1753 Mon Sep 17 00:00:00 2001 From: mollerhoj Date: Mon, 3 Jul 2017 15:41:59 +0200 Subject: [PATCH 02/99] Clean up a couple of strange English stopwords --- spacy/lang/en/stop_words.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py index 640940fea..394731ff1 100644 --- a/spacy/lang/en/stop_words.py +++ b/spacy/lang/en/stop_words.py @@ -16,7 +16,7 @@ call can cannot ca could did do does doing done down due during -each eight either eleven else elsewhere empty enough etc even ever every +each eight either eleven else elsewhere empty enough even ever every everyone everything everywhere except few fifteen fifty first five for former formerly forty four from front full @@ -27,7 +27,7 @@ 
get give go had has have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred -i if in inc indeed into is it its itself +i if in indeed into is it its itself keep From e8400776012931e414599905b8d2923fe78ab458 Mon Sep 17 00:00:00 2001 From: mollerhoj Date: Mon, 3 Jul 2017 15:43:06 +0200 Subject: [PATCH 03/99] Add some basic tests for Danish --- spacy/tests/conftest.py | 3 +++ spacy/tests/lang/da/__init__.py | 0 spacy/tests/lang/da/test_exceptions.py | 15 ++++++++++++++ spacy/tests/lang/da/test_text.py | 27 ++++++++++++++++++++++++++ 4 files changed, 45 insertions(+) create mode 100644 spacy/tests/lang/da/__init__.py create mode 100644 spacy/tests/lang/da/test_exceptions.py create mode 100644 spacy/tests/lang/da/test_text.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 200f9ff4f..b6232970a 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -105,6 +105,9 @@ def he_tokenizer(): def nb_tokenizer(): return util.get_lang_class('nb').Defaults.create_tokenizer() +@pytest.fixture +def da_tokenizer(): + return util.get_lang_class('da').Defaults.create_tokenizer() @pytest.fixture def stringstore(): diff --git a/spacy/tests/lang/da/__init__.py b/spacy/tests/lang/da/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py new file mode 100644 index 000000000..d89fafd2c --- /dev/null +++ b/spacy/tests/lang/da/test_exceptions.py @@ -0,0 +1,15 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +@pytest.mark.parametrize('text', ["ca.", "m.a.o.", "Jan.", "Dec."]) +def test_da_tokenizer_handles_abbr(da_tokenizer, text): + tokens = da_tokenizer(text) + assert len(tokens) == 1 + +def test_da_tokenizer_handles_exc_in_text(da_tokenizer): + text = "Det er bl.a. ikke meningen" + tokens = da_tokenizer(text) + assert len(tokens) == 5 + assert tokens[2].text == "bl.a." diff --git a/spacy/tests/lang/da/test_text.py b/spacy/tests/lang/da/test_text.py new file mode 100644 index 000000000..fa6a935f6 --- /dev/null +++ b/spacy/tests/lang/da/test_text.py @@ -0,0 +1,27 @@ +# coding: utf-8 +"""Test that longer and mixed texts are tokenized correctly.""" + + +from __future__ import unicode_literals + +import pytest + +def test_da_tokenizer_handles_long_text(da_tokenizer): + text = """Der var så dejligt ude på landet. Det var sommer, kornet stod gult, havren grøn, +høet var rejst i stakke nede i de grønne enge, og der gik storken på sine lange, +røde ben og snakkede ægyptisk, for det sprog havde han lært af sin moder. 
+ +Rundt om ager og eng var der store skove, og midt i skovene dybe søer; jo, der var rigtignok dejligt derude på landet!""" + tokens = da_tokenizer(text) + assert len(tokens) == 84 + +@pytest.mark.parametrize('text,match', [ + ('10', True), ('1', True), ('10.000', True), ('10.00', True), + ('999,0', True), ('en', True), ('treoghalvfemsindstyvende', True), ('hundrede', True), + ('hund', False), (',', False), ('1/2', True)]) +def test_lex_attrs_like_number(da_tokenizer, text, match): + tokens = da_tokenizer(text) + assert len(tokens) == 1 + print(tokens[0]) + assert tokens[0].like_num == match + From e8f40ceed8d259df3102dc68bbb13cdb34d704f1 Mon Sep 17 00:00:00 2001 From: mollerhoj Date: Mon, 3 Jul 2017 15:44:17 +0200 Subject: [PATCH 04/99] Add short names of months to tokenizer_exceptions --- spacy/lang/da/tokenizer_exceptions.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index fbfbbad86..6bf9ab669 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -1,11 +1,27 @@ # encoding: utf8 from __future__ import unicode_literals -from ...symbols import ORTH, LEMMA +from ...symbols import ORTH, LEMMA, NORM _exc = {} +for exc_data in [ + {ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, + + {ORTH: "Jan.", LEMMA: "januar", NORM: "januar"}, + {ORTH: "Feb.", LEMMA: "februar", NORM: "februar"}, + {ORTH: "Mar.", LEMMA: "marts", NORM: "marts"}, + {ORTH: "Apr.", LEMMA: "april", NORM: "april"}, + {ORTH: "Maj.", LEMMA: "maj", NORM: "maj"}, + {ORTH: "Jun.", LEMMA: "juni", NORM: "juni"}, + {ORTH: "Jul.", LEMMA: "juli", NORM: "juli"}, + {ORTH: "Aug.", LEMMA: "august", NORM: "august"}, + {ORTH: "Sep.", LEMMA: "september", NORM: "september"}, + {ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"}, + {ORTH: "Nov.", LEMMA: "november", NORM: "november"}, + {ORTH: "Dec.", LEMMA: "december", NORM: "december"}]: + _exc[exc_data[ORTH]] = [dict(exc_data)] for orth in [ "A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.", From 3b2cb107a37804b89792b1993088e59a78d26323 Mon Sep 17 00:00:00 2001 From: mollerhoj Date: Mon, 3 Jul 2017 15:45:31 +0200 Subject: [PATCH 05/99] Add like_num functionality to Danish --- spacy/lang/da/__init__.py | 2 ++ spacy/lang/da/lex_attrs.py | 52 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 spacy/lang/da/lex_attrs.py diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 99babdc2c..1dc4d4820 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -13,6 +14,7 @@ from ...util import update_exc, add_lookups class DanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: 'da' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) diff --git a/spacy/lang/da/lex_attrs.py b/spacy/lang/da/lex_attrs.py new file mode 100644 index 000000000..8152ad259 --- /dev/null +++ b/spacy/lang/da/lex_attrs.py @@ -0,0 +1,52 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +# Source 
http://fjern-uv.dk/tal.php + +_num_words = """nul +en et to tre fire fem seks syv otte ni ti +elleve tolv tretten fjorten femten seksten sytten atten nitten tyve +enogtyve toogtyve treogtyve fireogtyve femogtyve seksogtyve syvogtyve otteogtyve niogtyve tredive +enogtredive toogtredive treogtredive fireogtredive femogtredive seksogtredive syvogtredive otteogtredive niogtredive fyrre +enogfyrre toogfyrre treogfyrre fireogfyrre femogfyrre seksogfyrre syvogfyrre otteogfyrre niogfyrre halvtreds +enoghalvtreds tooghalvtreds treoghalvtreds fireoghalvtreds femoghalvtreds seksoghalvtreds syvoghalvtreds otteoghalvtreds nioghalvtreds tres +enogtres toogtres treogtres fireogtres femogtres seksogtres syvogtres otteogtres niogtres halvfjerds +enoghalvfjerds tooghalvfjerds treoghalvfjerds fireoghalvfjerds femoghalvfjerds seksoghalvfjerds syvoghalvfjerds otteoghalvfjerds nioghalvfjerds firs +enogfirs toogfirs treogfirs fireogfirs femogfirs seksogfirs syvogfirs otteogfirs niogfirs halvfems +enoghalvfems tooghalvfems treoghalvfems fireoghalvfems femoghalvfems seksoghalvfems syvoghalvfems otteoghalvfems nioghalvfems hundrede +million milliard billion billiard trillion trilliard +""".split() + +# source http://www.duda.dk/video/dansk/grammatik/talord/talord.html + +_ordinal_words = """nulte +første anden tredje fjerde femte sjette syvende ottende niende tiende +elfte tolvte trettende fjortende femtende sekstende syttende attende nittende tyvende +enogtyvende toogtyvende treogtyvende fireogtyvende femogtyvende seksogtyvende syvogtyvende otteogtyvende niogtyvende tredivte enogtredivte toogtredivte treogtredivte fireogtredivte femogtredivte seksogtredivte syvogtredivte otteogtredivte niogtredivte fyrretyvende +enogfyrretyvende toogfyrretyvende treogfyrretyvende fireogfyrretyvende femogfyrretyvende seksogfyrretyvende syvogfyrretyvende otteogfyrretyvende niogfyrretyvende halvtredsindstyvende enoghalvtredsindstyvende +tooghalvtredsindstyvende treoghalvtredsindstyvende fireoghalvtredsindstyvende femoghalvtredsindstyvende seksoghalvtredsindstyvende syvoghalvtredsindstyvende otteoghalvtredsindstyvende nioghalvtredsindstyvende +tresindstyvende enogtresindstyvende toogtresindstyvende treogtresindstyvende fireogtresindstyvende femogtresindstyvende seksogtresindstyvende syvogtresindstyvende otteogtresindstyvende niogtresindstyvende halvfjerdsindstyvende +enoghalvfjerdsindstyvende tooghalvfjerdsindstyvende treoghalvfjerdsindstyvende fireoghalvfjerdsindstyvende femoghalvfjerdsindstyvende seksoghalvfjerdsindstyvende syvoghalvfjerdsindstyvende otteoghalvfjerdsindstyvende nioghalvfjerdsindstyvende firsindstyvende +enogfirsindstyvende toogfirsindstyvende treogfirsindstyvende fireogfirsindstyvende femogfirsindstyvende seksogfirsindstyvende syvogfirsindstyvende otteogfirsindstyvende niogfirsindstyvende halvfemsindstyvende +enoghalvfemsindstyvende tooghalvfemsindstyvende treoghalvfemsindstyvende fireoghalvfemsindstyvende femoghalvfemsindstyvende seksoghalvfemsindstyvende syvoghalvfemsindstyvende otteoghalvfemsindstyvende nioghalvfemsindstyvende +""".split() + +def like_num(text): + text = text.replace(',', '').replace('.', '') + if text.isdigit(): + return True + if text.count('/') == 1: + num, denom = text.split('/') + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + if text in _ordinal_words: + return True + return False + +LEX_ATTRS = { + LIKE_NUM: like_num +} From 64c732918a39907860d4107b9d25281152b32fe1 Mon Sep 17 00:00:00 2001 From: mollerhoj Date: Mon, 3 Jul 2017 15:49:09 +0200
Subject: [PATCH 06/99] Add Morph_rules. (TODO: Not working?) --- spacy/lang/da/__init__.py | 2 ++ spacy/lang/da/morph_rules.py | 41 ++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 spacy/lang/da/morph_rules.py diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 1dc4d4820..d83ad8048 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS +from .morph_rules import MORPH_RULES from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -19,6 +20,7 @@ class DanishDefaults(Language.Defaults): lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + #morph_rules = dict(MORPH_RULES) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/da/morph_rules.py b/spacy/lang/da/morph_rules.py new file mode 100644 index 000000000..b365bf871 --- /dev/null +++ b/spacy/lang/da/morph_rules.py @@ -0,0 +1,41 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import LEMMA +from ...deprecated import PRON_LEMMA + +MORPH_RULES = { + "PRON": { + "jeg": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"}, + "mig": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc"}, + "du": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"}, + "han": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Nom"}, + "ham": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Acc"}, + "hun": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Nom"}, + "hende": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Acc"}, + "den": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, + "det": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, + "vi": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Nom"}, + "os": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc"}, + "de": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Nom"}, + "dem": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"}, + + "min": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"}, + "din": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"}, + "hans": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"}, + "hendes": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"}, + "dens": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"}, + "dets": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"}, + "vores": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", 
"Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, + "deres": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, + }, + + "VERB": { + "er": {LEMMA: "være", "VerbForm": "Fin", "Tense": "Pres"}, + "var": {LEMMA: "være", "VerbForm": "Fin", "Tense": "Past"} + } +} + +for tag, rules in MORPH_RULES.items(): + for key, attrs in dict(rules).items(): + rules[key.title()] = attrs From 85144835dab55336e07f5c806f3cd54911fea9e2 Mon Sep 17 00:00:00 2001 From: mollerhoj Date: Mon, 3 Jul 2017 15:51:58 +0200 Subject: [PATCH 07/99] Add Tag_map for Danish --- spacy/lang/da/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index d83ad8048..5f6cb867b 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .morph_rules import MORPH_RULES +from ..tag_map import TAG_MAP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -21,6 +22,7 @@ class DanishDefaults(Language.Defaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) #morph_rules = dict(MORPH_RULES) + tag_map = dict(TAG_MAP) stop_words = set(STOP_WORDS) From d71702b8274cfb61153a76f97713637ba239adac Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 20:10:37 +0200 Subject: [PATCH 08/99] Fix formatting --- website/api/_annotation/_biluo.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/api/_annotation/_biluo.jade b/website/api/_annotation/_biluo.jade index dc6168732..34d93f768 100644 --- a/website/api/_annotation/_biluo.jade +++ b/website/api/_annotation/_biluo.jade @@ -1,6 +1,6 @@ //- 💫 DOCS > API > ANNOTATION > BILUO -+table([ "Tag", "Description" ]) ++table(["Tag", "Description"]) +row +cell #[code #[span.u-color-theme B] EGIN] +cell The first token of a multi-token entity. 
From 7459ecfa87cc41f6195a4f49a5842c0eb1879dd8 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 20:13:34 +0200 Subject: [PATCH 09/99] Port over contributor agreements --- .github/CONTRIBUTOR_AGREEMENT.md | 10 +-- .github/contributors/demfier.md | 106 +++++++++++++++++++++++++++ .github/contributors/honnibal.md | 106 +++++++++++++++++++++++++++ .github/contributors/ines.md | 106 +++++++++++++++++++++++++++ .github/contributors/jerbob92.md | 106 +++++++++++++++++++++++++++ .github/contributors/johnhaley81.md | 106 +++++++++++++++++++++++++++ .github/contributors/mdcclv.md | 106 +++++++++++++++++++++++++++ .github/contributors/polm.md | 106 +++++++++++++++++++++++++++ .github/contributors/shuvanon.md | 108 ++++++++++++++++++++++++++++ .github/contributors/yuukos.md | 106 +++++++++++++++++++++++++++ 10 files changed, 961 insertions(+), 5 deletions(-) create mode 100644 .github/contributors/demfier.md create mode 100644 .github/contributors/honnibal.md create mode 100644 .github/contributors/ines.md create mode 100644 .github/contributors/jerbob92.md create mode 100644 .github/contributors/johnhaley81.md create mode 100644 .github/contributors/mdcclv.md create mode 100644 .github/contributors/polm.md create mode 100644 .github/contributors/shuvanon.md create mode 100644 .github/contributors/yuukos.md diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index c915d48bf..f34603065 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -87,8 +87,8 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [x] I am signing on behalf of myself as an individual and no other person - or entity, including my employer, has or will have rights with respect my + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my contributions. * [ ] I am signing on behalf of my employer or a legal entity and I have the @@ -98,9 +98,9 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | -| Name | Shuvanon Razik | +| Name | | | Company name (if applicable) | | | Title or role (if applicable) | | -| Date | 3/12/2017 | -| GitHub username | shuvanon | +| Date | | +| GitHub username | | | Website (optional) | | diff --git a/.github/contributors/demfier.md b/.github/contributors/demfier.md new file mode 100644 index 000000000..1a730fc78 --- /dev/null +++ b/.github/contributors/demfier.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Gaurav Sahu | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-18 | +| GitHub username | demfier | +| Website (optional) | | diff --git a/.github/contributors/honnibal.md b/.github/contributors/honnibal.md new file mode 100644 index 000000000..3a700b7dd --- /dev/null +++ b/.github/contributors/honnibal.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Matthew Honnibal | +| Company name (if applicable) | Explosion AI | +| Title or role (if applicable) | Founder | +| Date | 2017-10-18 | +| GitHub username | honnibal | +| Website (optional) | https://explosion.ai | diff --git a/.github/contributors/ines.md b/.github/contributors/ines.md new file mode 100644 index 000000000..5cd57b07e --- /dev/null +++ b/.github/contributors/ines.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ines Montani | +| Company name (if applicable) | Explosion AI | +| Title or role (if applicable) | Founder | +| Date | 2017/10/18 | +| GitHub username | ines | +| Website (optional) | https://explosion.ai | diff --git a/.github/contributors/jerbob92.md b/.github/contributors/jerbob92.md new file mode 100644 index 000000000..bb0430d14 --- /dev/null +++ b/.github/contributors/jerbob92.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jeroen Bobbeldijk | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 22-10-2017 | +| GitHub username | jerbob92 | +| Website (optional) | | diff --git a/.github/contributors/johnhaley81.md b/.github/contributors/johnhaley81.md new file mode 100644 index 000000000..277b3126c --- /dev/null +++ b/.github/contributors/johnhaley81.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | John Haley | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 19/10/2017 | +| GitHub username | johnhaley81 | +| Website (optional) | | diff --git a/.github/contributors/mdcclv.md b/.github/contributors/mdcclv.md new file mode 100644 index 000000000..14ebfae26 --- /dev/null +++ b/.github/contributors/mdcclv.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------------------- | +| Name | Orion Montoya | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 04-10-2017 | +| GitHub username | mdcclv | +| Website (optional) | http://www.mdcclv.com/ | diff --git a/.github/contributors/polm.md b/.github/contributors/polm.md new file mode 100644 index 000000000..a2aa0cb65 --- /dev/null +++ b/.github/contributors/polm.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Paul McCann | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-14 | +| GitHub username | polm | +| Website (optional) | http://dampfkraft.com| diff --git a/.github/contributors/shuvanon.md b/.github/contributors/shuvanon.md new file mode 100644 index 000000000..82d02d8d2 --- /dev/null +++ b/.github/contributors/shuvanon.md @@ -0,0 +1,108 @@ + + +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Shuvanon Razik | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 3/12/2017 | +| GitHub username | shuvanon | +| Website (optional) | | diff --git a/.github/contributors/yuukos.md b/.github/contributors/yuukos.md new file mode 100644 index 000000000..aecafeecb --- /dev/null +++ b/.github/contributors/yuukos.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Alexey Kim | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 13-12-2017 | +| GitHub username | yuukos | +| Website (optional) | | From c815ff65f6986302bf6d89c7747e53bcbc65ee9e Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 21:38:53 +0200 Subject: [PATCH 10/99] Update feature list --- website/index.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/index.jade b/website/index.jade index 0155ab295..1abe5a984 100644 --- a/website/index.jade +++ b/website/index.jade @@ -79,12 +79,12 @@ include _includes/_mixins +h(2) Features +list +item Non-destructive #[strong tokenization] + +item #[strong Named entity] recognition +item Support for #[strong #{LANG_COUNT}+ languages] +item #[strong #{MODEL_COUNT} statistical models] for #{MODEL_LANG_COUNT} languages +item Pre-trained #[strong word vectors] +item Easy #[strong deep learning] integration +item Part-of-speech tagging - +item #[strong Named entity] recognition +item Labelled dependency parsing +item Syntax-driven sentence segmentation +item Built in #[strong visualizers] for syntax and NER From 63683a515132eef4e8668e51f6ed65066080cb67 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 21:39:05 +0200 Subject: [PATCH 11/99] Port over contributors from master --- CONTRIBUTORS.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index b64dc8db3..edd1ed30d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -3,6 +3,8 @@ This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work! * Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer) +* Alexey Kim, [@yuukos](https://github.com/yuukos) +* Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman) * Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv) * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th) * Aniruddha Adhikary [@aniruddha-adhikary](https://github.com/aniruddha-adhikary) @@ -16,6 +18,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Daniel Vila Suero, [@dvsrepo](https://github.com/dvsrepo) * Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi) * Eric Zhao, [@ericzhao28](https://github.com/ericzhao28) +* Francisco Aranda, [@frascuchon](https://github.com/frascuchon) * Greg Baker, [@solresol](https://github.com/solresol) * Grégory Howard, [@Gregory-Howard](https://github.com/Gregory-Howard) * György Orosz, [@oroszgy](https://github.com/oroszgy) @@ -24,6 +27,9 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Ines Montani, [@ines](https://github.com/ines) * J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading) * Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan) +* Jim Geovedi, [@geovedi](https://github.com/geovedi) +* Jim Regan, [@jimregan](https://github.com/jimregan) +* Jeffrey Gerard, [@IamJeffG](https://github.com/IamJeffG) * Jordan Suchow, [@suchow](https://github.com/suchow) * Josh Reeter, [@jreeter](https://github.com/jreeter) * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks) @@ -38,6 +44,8 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Michael Wallin, [@wallinm1](https://github.com/wallinm1) * Miguel Almeida, [@mamoit](https://github.com/mamoit) * Oleg 
Zd, [@olegzd](https://github.com/olegzd) +* Orion Montoya, [@mdcclv](https://github.com/mdcclv) +* Paul O'Leary McCann, [@polm](https://github.com/polm) * Pokey Rule, [@pokey](https://github.com/pokey) * Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202) * Rob van Nieuwpoort, [@RvanNieuwpoort](https://github.com/RvanNieuwpoort) @@ -45,12 +53,18 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Sam Bozek, [@sambozek](https://github.com/sambozek) * Sasho Savkov, [@savkov](https://github.com/savkov) * Shuvanon Razik, [@shuvanon](https://github.com/shuvanon) +* Swier, [@swierh](https://github.com/swierh) * Thomas Tanon, [@Tpt](https://github.com/Tpt) * Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues) +* Vimos Tan, [@Vimos](https://github.com/Vimos) * Vsevolod Solovyov, [@vsolovyov](https://github.com/vsolovyov) * Wah Loon Keng, [@kengz](https://github.com/kengz) +* Wannaphong Phatthiyaphaibun, [@wannaphongcom](https://github.com/wannaphongcom) * Willem van Hage, [@wrvhage](https://github.com/wrvhage) * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker) +* Yam, [@hscspring](https://github.com/hscspring) * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang) * Yasuaki Uechi, [@uetchy](https://github.com/uetchy) +* Yu-chun Huang, [@galaxyh](https://github.com/galaxyh) * Yubing Dong, [@tomtung](https://github.com/tomtung) +* Yuval Pinter, [@yuvalpinter](https://github.com/yuvalpinter) From 972d9e832cc782bdc50693b0cf8c62f3ee247c7d Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 21:39:18 +0200 Subject: [PATCH 12/99] Update README for v2.0 --- README.rst | 256 +++++++++++++++++++---------------------------------- 1 file changed, 93 insertions(+), 163 deletions(-) diff --git a/README.rst b/README.rst index 244308473..27fca3fc2 100644 --- a/README.rst +++ b/README.rst @@ -1,15 +1,16 @@ spaCy: Industrial-strength NLP ****************************** -spaCy is a library for advanced natural language processing in Python and +spaCy is a library for advanced Natural Language Processing in Python and Cython. spaCy is built on the very latest research, but it isn't researchware. -It was designed from day one to be used in real products. spaCy currently supports -English, German, French and Spanish, as well as tokenization for Italian, -Portuguese, Dutch, Swedish, Finnish, Norwegian, Danish, Hungarian, Polish, -Bengali, Hebrew, Chinese and Japanese. It's commercial open-source software, -released under the MIT license. +It was designed from day one to be used in real products. spaCy comes with +`pre-trained statistical models `_ and word +vectors, and currently supports tokenization for **20+ languages**. It features +the **fastest syntactic parser** in the world, convolutional **neural network models** +for tagging, parsing and **named entity recognition** and easy **deep learning** +integration. It's commercial open-source software, released under the MIT license. -💫 **Version 1.8 out now!** `Read the release notes here. `_ +💫 **Version 2.0 out now!** `Check out the new features here. `_ .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square :target: https://travis-ci.org/explosion/spaCy @@ -38,68 +39,72 @@ released under the MIT license. 📖 Documentation ================ -=================== === -`Usage Workflows`_ How to use spaCy and its features. -`API Reference`_ The detailed reference for spaCy's API. -`Troubleshooting`_ Common problems and solutions for beginners. 
-`Tutorials`_        End-to-end examples, with code you can modify and run.
-`Showcase & Demos`_ Demos, libraries and products from the spaCy community.
-`Contribute`_       How to contribute to the spaCy project and code base.
-=================== ===
+=================== ===
+`spaCy 101`_        New to spaCy? Here's everything you need to know!
+`Usage Guides`_     How to use spaCy and its features.
+`New in v2.0`_      New features, backwards incompatibilities and migration guide.
+`API Reference`_    The detailed reference for spaCy's API.
+`Models`_           Download statistical language models for spaCy.
+`Resources`_        Libraries, extensions, demos, books and courses.
+`Changelog`_        Changes and version history.
+`Contribute`_       How to contribute to the spaCy project and code base.
+=================== ===

-.. _Usage Workflows: https://spacy.io/docs/usage/
-.. _API Reference: https://spacy.io/docs/api/
-.. _Troubleshooting: https://spacy.io/docs/usage/troubleshooting
-.. _Tutorials: https://spacy.io/docs/usage/tutorials
-.. _Showcase & Demos: https://spacy.io/docs/usage/showcase
+.. _spaCy 101: https://alpha.spacy.io/usage/spacy-101
+.. _New in v2.0: https://alpha.spacy.io/usage/v2#migrating
+.. _Usage Guides: https://alpha.spacy.io/usage/
+.. _API Reference: https://alpha.spacy.io/api/
+.. _Models: https://alpha.spacy.io/models
+.. _Resources: https://alpha.spacy.io/usage/resources
+.. _Changelog: https://alpha.spacy.io/usage/#changelog
 .. _Contribute: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md

 💬 Where to ask questions
 ==========================

+The spaCy project is maintained by `@honnibal `_
+and `@ines `_. Please understand that we won't be able
+to provide individual support via email. We also believe that help is much more
+valuable if it's shared publicly, so that more people can benefit from it.
+
 ====================== ===
-**Bug reports**        `GitHub issue tracker`_
-**Usage questions**    `StackOverflow`_, `Gitter chat`_, `Reddit user group`_
-**General discussion** `Gitter chat`_, `Reddit user group`_
-**Commercial support** contact@explosion.ai
+**Bug Reports**        `GitHub Issue Tracker`_
+**Usage Questions**    `StackOverflow`_, `Gitter Chat`_, `Reddit User Group`_
+**General Discussion** `Gitter Chat`_, `Reddit User Group`_
 ====================== ===

-.. _GitHub issue tracker: https://github.com/explosion/spaCy/issues
+.. _GitHub Issue Tracker: https://github.com/explosion/spaCy/issues
 .. _StackOverflow: http://stackoverflow.com/questions/tagged/spacy
-.. _Gitter chat: https://gitter.im/explosion/spaCy
-.. _Reddit user group: https://www.reddit.com/r/spacynlp
+.. _Gitter Chat: https://gitter.im/explosion/spaCy
+.. 
_Reddit User Group: https://www.reddit.com/r/spacynlp Features ======== -* Non-destructive **tokenization** -* Syntax-driven sentence segmentation -* Pre-trained **word vectors** -* Part-of-speech tagging +* **Fastest syntactic parser** in the world * **Named entity** recognition -* Labelled dependency parsing -* Convenient string-to-int mapping -* Export to numpy data arrays -* GIL-free **multi-threading** -* Efficient binary serialization +* Non-destructive **tokenization** +* Support for **20+ languages** +* Pre-trained `statistical models `_ and word vectors * Easy **deep learning** integration -* Statistical models for **English**, **German**, **French** and **Spanish** +* Part-of-speech tagging +* Labelled dependency parsing +* Syntax-driven sentence segmentation +* Built in **visualizers** for syntax and NER +* Convenient string-to-hash mapping +* Export to numpy data arrays +* Efficient binary serialization +* Easy **model packaging** and deployment * State-of-the-art speed * Robust, rigorously evaluated accuracy -See `facts, figures and benchmarks `_. +📖 **For more details, see the** `facts, figures and benchmarks `_. -Top Performance ---------------- +Install spaCy +============= -* Fastest in the world: <50ms per document. No faster system has ever been - announced. -* Accuracy within 1% of the current state of the art on all tasks performed - (parsing, named entity recognition, part-of-speech tagging). The only more - accurate systems are an order of magnitude slower or more. - -Supports --------- +For detailed installation instructions, see +the `documentation `_. ==================== === **Operating system** macOS / OS X, Linux, Windows (Cygwin, MinGW, Visual Studio) @@ -110,12 +115,6 @@ Supports .. _pip: https://pypi.python.org/pypi/spacy .. _conda: https://anaconda.org/conda-forge/spacy -Install spaCy -============= - -Installation requires a working build environment. See notes on Ubuntu, -macOS/OS X and Windows for details. - pip --- @@ -123,7 +122,7 @@ Using pip, spaCy releases are currently only available as source packages. .. code:: bash - pip install -U spacy + pip install spacy When using pip it is generally recommended to install packages in a ``virtualenv`` to avoid modifying system state: @@ -149,25 +148,41 @@ For the feedstock including the build recipe and configuration, check out `this repository `_. Improvements and pull requests to the recipe and setup are always appreciated. +Updating spaCy +-------------- + +Some updates to spaCy may require downloading new statistical models. If you're +running spaCy v2.0 or higher, you can use the ``validate`` command to check if +your installed models are compatible and if not, print details on how to update +them: + +.. code:: bash + + pip install -U spacy + spacy validate + +If you've trained your own models, keep in mind that your training and runtime +inputs must match. After updating spaCy, we recommend **retraining your models** +with the new version. + +📖 **For details on upgrading from spaCy 1.x to spaCy 2.x, see the** +`migration guide `_. + Download models =============== As of v1.7.0, models for spaCy can be installed as **Python packages**. This means that they're a component of your application, just like any -other module. They're versioned and can be defined as a dependency in your -``requirements.txt``. Models can be installed from a download URL or -a local directory, manually or via pip. Their data can be located anywhere on -your file system. 
To make a model available to spaCy, all you need to do is -create a "shortcut link", an internal alias that tells spaCy where to find the -data files for a specific model name. +other module. Models can be installed using spaCy's ``download`` command, +or manually by pointing pip to a path or URL. ======================= === -`spaCy Models`_ Available models, latest releases and direct download. +`Available Models`_ Detailed model descriptions, accuracy figures and benchmarks. `Models Documentation`_ Detailed usage instructions. ======================= === -.. _spaCy Models: https://github.com/explosion/spacy-models/releases/ -.. _Models Documentation: https://spacy.io/docs/usage/models +.. _Available Models: https://alpha.spacy.io/models +.. _Models Documentation: https://alpha.spacy.io/docs/usage/models .. code:: bash @@ -175,17 +190,10 @@ data files for a specific model name. python -m spacy download en # download best-matching version of specific model for your spaCy installation - python -m spacy download en_core_web_md + python -m spacy download en_core_web_lg # pip install .tar.gz archive from path or URL - pip install /Users/you/en_core_web_md-1.2.0.tar.gz - pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-1.2.0/en_core_web_md-1.2.0.tar.gz - - # set up shortcut link to load installed package as "en_default" - python -m spacy link en_core_web_md en_default - - # set up shortcut link to load local model as "my_amazing_model" - python -m spacy link /Users/you/data my_amazing_model + pip install /Users/you/en_core_web_sm-2.0.0.tar.gz Loading and using models ------------------------ @@ -199,24 +207,24 @@ To load a model, use ``spacy.load()`` with the model's shortcut link: doc = nlp(u'This is a sentence.') If you've installed a model via pip, you can also ``import`` it directly and -then call its ``load()`` method with no arguments. This should also work for -older models in previous versions of spaCy. +then call its ``load()`` method: .. code:: python import spacy - import en_core_web_md + import en_core_web_sm - nlp = en_core_web_md.load() + nlp = en_core_web_.load() doc = nlp(u'This is a sentence.') -📖 **For more info and examples, check out the** `models documentation `_. +📖 **For more info and examples, check out the** +`models documentation `_. Support for older versions -------------------------- -If you're using an older version (v1.6.0 or below), you can still download and -install the old models from within spaCy using ``python -m spacy.en.download all`` +If you're using an older version (``v1.6.0`` or below), you can still download +and install the old models from within spaCy using ``python -m spacy.en.download all`` or ``python -m spacy.de.download all``. The ``.tar.gz`` archives are also `attached to the v1.6.0 release `_. To download and install the models manually, unpack the archive, drop the @@ -248,11 +256,13 @@ details. pip install -r requirements.txt pip install -e . -Compared to regular install via pip `requirements.txt `_ +Compared to regular install via pip, `requirements.txt `_ additionally installs developer dependencies such as Cython. - Instead of the above verbose commands, you can also use the following -`Fabric `_ commands: +`Fabric `_ commands. All commands assume that your +``virtualenv`` is located in a directory ``.env``. If you're using a different +directory, you can change it via the environment variable ``VENV_DIR``, for +example ``VENV_DIR=".custom-env" fab clean make``. 
============= === ``fab env`` Create ``virtualenv`` and delete previous one, if it exists. @@ -261,14 +271,6 @@ Instead of the above verbose commands, you can also use the following ``fab test`` Run basic tests, aborting after first failure. ============= === -All commands assume that your ``virtualenv`` is located in a directory ``.env``. -If you're using a different directory, you can change it via the environment -variable ``VENV_DIR``, for example: - -.. code:: bash - - VENV_DIR=".custom-env" fab clean make - Ubuntu ------ @@ -310,76 +312,4 @@ and ``--model`` are optional and enable additional tests: # make sure you are using recent pytest version python -m pip install -U pytest - python -m pytest - -🛠 Changelog -============ - -=========== ============== =========== -Version Date Description -=========== ============== =========== -`v1.8.2`_ ``2017-04-26`` French model and small improvements -`v1.8.1`_ ``2017-04-23`` Saving, loading and training bug fixes -`v1.8.0`_ ``2017-04-16`` Better NER training, saving and loading -`v1.7.5`_ ``2017-04-07`` Bug fixes and new CLI commands -`v1.7.3`_ ``2017-03-26`` Alpha support for Hebrew, new CLI commands and bug fixes -`v1.7.2`_ ``2017-03-20`` Small fixes to beam parser and model linking -`v1.7.1`_ ``2017-03-19`` Fix data download for system installation -`v1.7.0`_ ``2017-03-18`` New 50 MB model, CLI, better downloads and lots of bug fixes -`v1.6.0`_ ``2017-01-16`` Improvements to tokenizer and tests -`v1.5.0`_ ``2016-12-27`` Alpha support for Swedish and Hungarian -`v1.4.0`_ ``2016-12-18`` Improved language data and alpha Dutch support -`v1.3.0`_ ``2016-12-03`` Improve API consistency -`v1.2.0`_ ``2016-11-04`` Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese -`v1.1.0`_ ``2016-10-23`` Bug fixes and adjustments -`v1.0.0`_ ``2016-10-18`` Support for deep learning workflows and entity-aware rule matcher -`v0.101.0`_ ``2016-05-10`` Fixed German model -`v0.100.7`_ ``2016-05-05`` German support -`v0.100.6`_ ``2016-03-08`` Add support for GloVe vectors -`v0.100.5`_ ``2016-02-07`` Fix incorrect use of header file -`v0.100.4`_ ``2016-02-07`` Fix OSX problem introduced in 0.100.3 -`v0.100.3`_ ``2016-02-06`` Multi-threading, faster loading and bugfixes -`v0.100.2`_ ``2016-01-21`` Fix data version lock -`v0.100.1`_ ``2016-01-21`` Fix install for OSX -`v0.100`_ ``2016-01-19`` Revise setup.py, better model downloads, bug fixes -`v0.99`_ ``2015-11-08`` Improve span merging, internal refactoring -`v0.98`_ ``2015-11-03`` Smaller package, bug fixes -`v0.97`_ ``2015-10-23`` Load the StringStore from a json list, instead of a text file -`v0.96`_ ``2015-10-19`` Hotfix to .merge method -`v0.95`_ ``2015-10-18`` Bug fixes -`v0.94`_ ``2015-10-09`` Fix memory and parse errors -`v0.93`_ ``2015-09-22`` Bug fixes to word vectors -=========== ============== =========== - -.. _v1.8.2: https://github.com/explosion/spaCy/releases/tag/v1.8.2 -.. _v1.8.1: https://github.com/explosion/spaCy/releases/tag/v1.8.1 -.. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0 -.. _v1.7.5: https://github.com/explosion/spaCy/releases/tag/v1.7.5 -.. _v1.7.3: https://github.com/explosion/spaCy/releases/tag/v1.7.3 -.. _v1.7.2: https://github.com/explosion/spaCy/releases/tag/v1.7.2 -.. _v1.7.1: https://github.com/explosion/spaCy/releases/tag/v1.7.1 -.. _v1.7.0: https://github.com/explosion/spaCy/releases/tag/v1.7.0 -.. _v1.6.0: https://github.com/explosion/spaCy/releases/tag/v1.6.0 -.. _v1.5.0: https://github.com/explosion/spaCy/releases/tag/v1.5.0 -.. 
_v1.4.0: https://github.com/explosion/spaCy/releases/tag/v1.4.0 -.. _v1.3.0: https://github.com/explosion/spaCy/releases/tag/v1.3.0 -.. _v1.2.0: https://github.com/explosion/spaCy/releases/tag/v1.2.0 -.. _v1.1.0: https://github.com/explosion/spaCy/releases/tag/v1.1.0 -.. _v1.0.0: https://github.com/explosion/spaCy/releases/tag/v1.0.0 -.. _v0.101.0: https://github.com/explosion/spaCy/releases/tag/0.101.0 -.. _v0.100.7: https://github.com/explosion/spaCy/releases/tag/0.100.7 -.. _v0.100.6: https://github.com/explosion/spaCy/releases/tag/0.100.6 -.. _v0.100.5: https://github.com/explosion/spaCy/releases/tag/0.100.5 -.. _v0.100.4: https://github.com/explosion/spaCy/releases/tag/0.100.4 -.. _v0.100.3: https://github.com/explosion/spaCy/releases/tag/0.100.3 -.. _v0.100.2: https://github.com/explosion/spaCy/releases/tag/0.100.2 -.. _v0.100.1: https://github.com/explosion/spaCy/releases/tag/0.100.1 -.. _v0.100: https://github.com/explosion/spaCy/releases/tag/0.100 -.. _v0.99: https://github.com/explosion/spaCy/releases/tag/0.99 -.. _v0.98: https://github.com/explosion/spaCy/releases/tag/0.98 -.. _v0.97: https://github.com/explosion/spaCy/releases/tag/0.97 -.. _v0.96: https://github.com/explosion/spaCy/releases/tag/0.96 -.. _v0.95: https://github.com/explosion/spaCy/releases/tag/0.95 -.. _v0.94: https://github.com/explosion/spaCy/releases/tag/0.94 -.. _v0.93: https://github.com/explosion/spaCy/releases/tag/0.93 From 1730648e195a854fc44d1970737cb128e874d0d5 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 21:49:04 +0200 Subject: [PATCH 13/99] Update pull request template --- .github/PULL_REQUEST_TEMPLATE.md | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e97a7ea16..ec11b78bd 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,20 +1,19 @@ - + ## Description - - + +### Types of change + -## Types of changes - -- [ ] **Bug fix** (non-breaking change fixing an issue) -- [ ] **New feature** (non-breaking change adding functionality to spaCy) -- [ ] **Breaking change** (fix or feature causing change to spaCy's existing functionality) -- [ ] **Documentation** (addition to documentation of spaCy) - -## Checklist: - -- [ ] My change requires a change to spaCy's documentation. -- [ ] I have updated the documentation accordingly. -- [ ] I have added tests to cover my changes. -- [ ] All new and existing tests passed. +## Checklist + +- [ ] I have submitted the spaCy Contributor Agreement. +- [ ] I ran the tests, and all new and existing tests passed. +- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. From 4a06eddb5fdc067bf02cca3b9567759372de4885 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 22:18:40 +0200 Subject: [PATCH 14/99] Update README --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 27fca3fc2..9cffd2cae 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,9 @@ spaCy: Industrial-strength NLP ****************************** -spaCy is a library for advanced Natural Language Processing in Python and -Cython. spaCy is built on the very latest research, but it isn't researchware. -It was designed from day one to be used in real products. spaCy comes with +spaCy is a library for advanced Natural Language Processing in Python and Cython. 
+It's built on the very latest research, and was designed from day one to be +used in real products. spaCy comes with `pre-trained statistical models `_ and word vectors, and currently supports tokenization for **20+ languages**. It features the **fastest syntactic parser** in the world, convolutional **neural network models** From 3484174e487c3ec6171042d06e6a994a8330c61c Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 11:57:43 +0200 Subject: [PATCH 15/99] Add Language.path --- spacy/language.py | 6 ++++++ website/api/language.jade | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index c706e532a..933ca772d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -127,6 +127,7 @@ class Language(object): RETURNS (Language): The newly constructed object. """ self._meta = dict(meta) + self._path = None if vocab is True: factory = self.Defaults.create_vocab vocab = factory(self, **meta.get('vocab', {})) @@ -142,6 +143,10 @@ class Language(object): bytes_data = self.to_bytes(vocab=False) return (unpickle_language, (self.vocab, self.meta, bytes_data)) + @property + def path(self): + return self._path + @property def meta(self): self._meta.setdefault('lang', self.vocab.lang) @@ -611,6 +616,7 @@ class Language(object): if not (path / 'vocab').exists(): exclude['vocab'] = True util.from_disk(path, deserializers, exclude) + self._path = path return self def to_bytes(self, disable=[], **exclude): diff --git a/website/api/language.jade b/website/api/language.jade index 668cbadd7..6aa2d7612 100644 --- a/website/api/language.jade +++ b/website/api/language.jade @@ -609,6 +609,14 @@ p Load state from a binary string. | Custom meta data for the Language class. If a model is loaded, | contains meta data of the model. + +row + +cell #[code path] + +tag-new(2) + +cell #[code Path] + +cell + | Path to the model data directory, if a model is loaded. Otherwise + | #[code None]. + +h(2, "class-attributes") Class attributes +table(["Name", "Type", "Description"]) From 0b1dcbac1488e62379c2da326d666b39221e84e9 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 12:08:46 +0200 Subject: [PATCH 16/99] Remove unused function --- spacy/_ml.py | 40 ---------------------------------------- 1 file changed, 40 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index b07e179f0..8a8d355d9 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -482,46 +482,6 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): return vectors, backward -def fine_tune(embedding, combine=None): - if combine is not None: - raise NotImplementedError( - "fine_tune currently only supports addition. 
Set combine=None") - def fine_tune_fwd(docs_tokvecs, drop=0.): - docs, tokvecs = docs_tokvecs - - lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') - - vecs, bp_vecs = embedding.begin_update(docs, drop=drop) - flat_tokvecs = embedding.ops.flatten(tokvecs) - flat_vecs = embedding.ops.flatten(vecs) - output = embedding.ops.unflatten( - (model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths) - - def fine_tune_bwd(d_output, sgd=None): - flat_grad = model.ops.flatten(d_output) - model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum() - model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum() - - bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd) - if sgd is not None: - sgd(model._mem.weights, model._mem.gradient, key=model.id) - return [d_o * model.mix[0] for d_o in d_output] - return output, fine_tune_bwd - - def fine_tune_predict(docs_tokvecs): - docs, tokvecs = docs_tokvecs - vecs = embedding(docs) - return [model.mix[0]*tv+model.mix[1]*v - for tv, v in zip(tokvecs, vecs)] - - model = wrap(fine_tune_fwd, embedding) - model.mix = model._mem.add((model.id, 'mix'), (2,)) - model.mix.fill(0.5) - model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix')) - model.predict = fine_tune_predict - return model - - @layerize def flatten(seqs, drop=0.): if isinstance(seqs[0], numpy.ndarray): From 7bcec574620b611882e74d2356f6ffdead628ae3 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 12:08:54 +0200 Subject: [PATCH 17/99] Remove unused attribute --- spacy/matcher.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index a0c69f4bf..2c001c652 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -198,7 +198,6 @@ cdef class Matcher: cdef public object _patterns cdef public object _entities cdef public object _callbacks - cdef public object _acceptors def __init__(self, vocab): """Create the Matcher. @@ -209,7 +208,6 @@ cdef class Matcher: """ self._patterns = {} self._entities = {} - self._acceptors = {} self._callbacks = {} self.vocab = vocab self.mem = Pool() From 7eebeeaf85d1637af744aa2b504ffa2d2df42ed6 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 12:09:47 +0200 Subject: [PATCH 18/99] Fix Matcher.__contains__ --- spacy/matcher.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 2c001c652..ea5b7e416 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -230,7 +230,7 @@ cdef class Matcher: key (unicode): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ - return len(self._patterns) + return key in self._patterns def add(self, key, on_match, *patterns): """Add a match-rule to the matcher. A match-rule consists of: an ID key, From 9c733a884922a447ae620ab41d97c086d429c8a4 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 12:09:56 +0200 Subject: [PATCH 19/99] Implement PhraseMatcher.__len__ --- spacy/matcher.pyx | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index ea5b7e416..be9634fc9 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -471,7 +471,13 @@ cdef class PhraseMatcher: self._callbacks = {} def __len__(self): - raise NotImplementedError + """Get the number of rules added to the matcher. Note that this only + returns the number of rules (identical with the number of IDs), not the + number of individual patterns. + + RETURNS (int): The number of rules. 
+        """
+        return len(self.phrase_ids)

     def __contains__(self, key):
         raise NotImplementedError

From 1262aa0bf9e954b9193781661f29652a97222b56 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 25 Oct 2017 12:10:04 +0200
Subject: [PATCH 20/99] Implement PhraseMatcher.__contains__

---
 spacy/matcher.pyx | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index be9634fc9..8b815194c 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -480,7 +480,13 @@ cdef class PhraseMatcher:
         return len(self.phrase_ids)

     def __contains__(self, key):
-        raise NotImplementedError
+        """Check whether the matcher contains rules for a match ID.
+
+        key (unicode): The match ID.
+        RETURNS (bool): Whether the matcher contains rules for this match ID.
+        """
+        cdef hash_t ent_id = self.matcher._normalize_key(key)
+        return ent_id in self.phrase_ids

     def __reduce__(self):
         return (self.__class__, (self.vocab,), None, None)

From 4d97efc3b5f1d51fa4ff9d2a350787298f77ab04 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 25 Oct 2017 12:10:16 +0200
Subject: [PATCH 21/99] Add missing docstrings

---
 spacy/matcher.pyx | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 8b815194c..6c1069578 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -255,6 +255,10 @@ cdef class Matcher:
         and '*' patterns in a row and their matches overlap, the first
         operator will behave non-greedily. This quirk in the semantics
         makes the matcher more efficient, by avoiding the need for back-tracking.
+
+        key (unicode): The match ID.
+        on_match (callable): Callback executed on match.
+        *patterns (list): List of token descriptions.
         """
         for pattern in patterns:
             if len(pattern) == 0:
@@ -492,6 +496,13 @@ cdef class PhraseMatcher:
         return (self.__class__, (self.vocab,), None, None)

     def add(self, key, on_match, *docs):
+        """Add a match-rule to the matcher. A match-rule consists of: an ID key,
+        an on_match callback, and one or more patterns.
+
+        key (unicode): The match ID.
+        on_match (callable): Callback executed on match.
+        *docs (Doc): `Doc` objects representing match patterns.
+        """
         cdef Doc doc
         for doc in docs:
             if len(doc) >= self.max_length:
@@ -520,6 +531,13 @@ cdef class PhraseMatcher:
         self.phrase_ids.set(phrase_hash, ent_id)

     def __call__(self, Doc doc):
+        """Find all sequences matching the supplied patterns on the `Doc`.
+
+        doc (Doc): The document to match over.
+        RETURNS (list): A list of `(key, start, end)` tuples,
+            describing the matches. A match tuple describes a span
+            `doc[start:end]`. The `label_id` and `key` are both integers.
+        """
         matches = []
         for _, start, end in self.matcher(doc):
             ent_id = self.accept_match(doc, start, end)
@@ -532,6 +550,14 @@ cdef class PhraseMatcher:
         return matches

     def pipe(self, stream, batch_size=1000, n_threads=2):
+        """Match a stream of documents, yielding them in turn.
+
+        docs (iterable): A stream of documents.
+        batch_size (int): The number of documents to accumulate into a working set.
+        n_threads (int): The number of threads with which to work on the buffer
+            in parallel, if the `Matcher` implementation supports multi-threading.
+        YIELDS (Doc): Documents, in order. 
+ """ for doc in stream: self(doc) yield doc From 72497c8cb2ed59ac1f0b9fd0c9f1b0f6a6d1f51e Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 12:15:43 +0200 Subject: [PATCH 22/99] Remove comments and add TODO --- spacy/tokenizer.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index bc09129de..e865c60dd 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -63,11 +63,8 @@ cdef class Tokenizer: return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): + # TODO: deprecation warning return Doc(self.vocab, words=strings) - #raise NotImplementedError( - # "Method deprecated in 1.0.\n" - # "Old: tokenizer.tokens_from_list(strings)\n" - # "New: Doc(tokenizer.vocab, words=strings)") @cython.boundscheck(False) def __call__(self, unicode string): From e70f80f29ed9c3acd92ac005af54a967ce32a3fb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 25 Oct 2017 13:46:41 +0200 Subject: [PATCH 23/99] Add Language.disable_pipes() --- spacy/language.py | 60 +++++++++++++++++++++++ spacy/tests/pipeline/test_pipe_methods.py | 18 +++++++ 2 files changed, 78 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index c706e532a..ddc089bd3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import absolute_import, unicode_literals from contextlib import contextmanager +import copy from thinc.neural import Model from thinc.neural.optimizers import Adam @@ -329,6 +330,29 @@ class Language(object): doc = proc(doc) return doc + def disable_pipes(self, *names): + '''Disable one or more pipeline components. + + If used as a context manager, the pipeline will be restored to the initial + state at the end of the block. Otherwise, a DisabledPipes object is + returned, that has a `.restore()` method you can use to undo your + changes. + + EXAMPLE: + + >>> nlp.add_pipe('parser') + >>> nlp.add_pipe('tagger') + >>> with nlp.disable_pipes('parser', 'tagger'): + >>> assert not nlp.has_pipe('parser') + >>> assert nlp.has_pipe('parser') + >>> disabled = nlp.disable_pipes('parser') + >>> assert len(disabled) == 1 + >>> assert not nlp.has_pipe('parser') + >>> disabled.restore() + >>> assert nlp.has_pipe('parser') + ''' + return DisabledPipes(self, *names) + def make_doc(self, text): return self.tokenizer(text) @@ -655,6 +679,42 @@ class Language(object): return self +class DisabledPipes(list): + '''Manager for temporary pipeline disabling.''' + def __init__(self, nlp, *names): + self.nlp = nlp + self.names = names + # Important! Not deep copy -- we just want the container (but we also + # want to support people providing arbitrarily typed nlp.pipeline + # objects.) + self.original_pipeline = copy.copy(nlp.pipeline) + list.__init__(self) + self.extend(nlp.remove_pipe(name) for name in names) + + def __enter__(self): + pass + + def __exit__(self, *args): + self.restore() + + def restore(self): + '''Restore the pipeline to its state when DisabledPipes was created.''' + current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline + unexpected = [name for name in current if not self.nlp.has_pipe(name)] + if unexpected: + # Don't change the pipeline if we're raising an error. + self.nlp.pipeline = current + msg = ( + "Some current components would be lost when restoring " + "previous pipeline state. 
If you added components after " + "calling nlp.disable_pipes(), you should remove them " + "explicitly with nlp.remove_pipe() before the pipeline is " + "restore. Names of the new components: %s" + ) + raise ValueError(msg % unexpected) + self[:] = [] + + def unpickle_language(vocab, meta, bytes_data): lang = Language(vocab=vocab) lang.from_bytes(bytes_data) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 5ec78aefb..dbcde3e5e 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -82,3 +82,21 @@ def test_remove_pipe(nlp, name): assert not len(nlp.pipeline) assert removed_name == name assert removed_component == new_pipe + + +@pytest.mark.parametrize('name', ['my_component']) +def test_disable_pipes_method(nlp, name): + nlp.add_pipe(new_pipe, name=name) + assert nlp.has_pipe(name) + disabled = nlp.disable_pipes(name) + assert not nlp.has_pipe(name) + disabled.restore() + + +@pytest.mark.parametrize('name', ['my_component']) +def test_disable_pipes_context(nlp, name): + nlp.add_pipe(new_pipe, name=name) + assert nlp.has_pipe(name) + with nlp.disable_pipes(name): + assert not nlp.has_pipe(name) + assert nlp.has_pipe(name) From 68e9de691728f3853218ee6871902f79f6cd4ae9 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 13:57:14 +0200 Subject: [PATCH 24/99] Add documentation --- website/api/language.jade | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/website/api/language.jade b/website/api/language.jade index 668cbadd7..52950987a 100644 --- a/website/api/language.jade +++ b/website/api/language.jade @@ -440,6 +440,37 @@ p +cell tuple +cell A #[code (name, component)] tuple of the removed component. ++h(2, "disable_pipes") Language.disable_pipes + +tag contextmanager + +tag-new(2) + +p + | Disable one or more pipeline components. If used as a context manager, + | the pipeline will be restored to the initial state at the end of the + | block. Otherwise, a #[code DisabledPipes] object is returned, that has a + | #[code .restore()] method you can use to undo your changes. + ++aside-code("Example"). + with nlp.disable_pipes('tagger', 'parser'): + optimizer = nlp.begin_training(gold_tuples) + + disabled = nlp.disable_pipes('tagger', 'parser') + optimizer = nlp.begin_training(gold_tuples) + disabled.restore() + ++table(["Name", "Type", "Description"]) + +row + +cell #[code *disabled] + +cell unicode + +cell Names of pipeline components to disable. + + +row("foot") + +cell returns + +cell #[code DisabledPipes] + +cell + | The disabled pipes that can be restored by calling the object's + | #[code .restore()] method. + +h(2, "to_disk") Language.to_disk +tag method +tag-new(2) From 0102561f34033163dd8b7f711e98f33687233ac8 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 13:57:55 +0200 Subject: [PATCH 25/99] Update docs --- website/api/language.jade | 1 + website/usage/v2.jade | 1 + 2 files changed, 2 insertions(+) diff --git a/website/api/language.jade b/website/api/language.jade index 6aa2d7612..b8fe98d78 100644 --- a/website/api/language.jade +++ b/website/api/language.jade @@ -229,6 +229,7 @@ p +cell Config parameters. +h(2, "preprocess_gold") Language.preprocess_gold + +tag method p | Can be called before training to pre-process gold data. By default, it diff --git a/website/usage/v2.jade b/website/usage/v2.jade index bb150de86..f833468bf 100644 --- a/website/usage/v2.jade +++ b/website/usage/v2.jade @@ -497,6 +497,7 @@ p +code-new. 
         nlp = spacy.load('en', disable=['tagger', 'ner'])
+        doc = nlp(u"I don't want parsed", disable=['parser'])
         nlp.remove_pipe('parser')
     +code-old.
         nlp = spacy.load('en', tagger=False, entity=False)

From 094512fd47a67501d911066035289a10454c873c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 25 Oct 2017 14:44:00 +0200
Subject: [PATCH 26/99] Fix model-mark on regression test.

---
 spacy/tests/regression/test_issue1305.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/tests/regression/test_issue1305.py b/spacy/tests/regression/test_issue1305.py
index d1d5eb93d..342cdd081 100644
--- a/spacy/tests/regression/test_issue1305.py
+++ b/spacy/tests/regression/test_issue1305.py
@@ -1,11 +1,10 @@
 import pytest
 import spacy

-#@pytest.mark.models('en')
+@pytest.mark.models('en')
 def test_issue1305():
     '''Test lemmatization of English VBZ'''
     nlp = spacy.load('en_core_web_sm')
     assert nlp.vocab.morphology.lemmatizer('works', 'verb') == ['work']
     doc = nlp(u'This app works well')
-    print([(w.text, w.tag_) for w in doc])
     assert doc[2].lemma_ == 'work'

From 7f03932477f92cb5a3b5ae0379f3ee7499a340b0 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 25 Oct 2017 14:56:16 +0200
Subject: [PATCH 27/99] Return self on __enter__

---
 spacy/language.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index ddc089bd3..5a85a83ec 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -692,7 +692,7 @@ class DisabledPipes(list):
         self.extend(nlp.remove_pipe(name) for name in names)

     def __enter__(self):
-        pass
+        return self

     def __exit__(self, *args):
         self.restore()

From 6a00de4f77f1391744f914ebe8f957e1da43a73e Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 25 Oct 2017 14:56:35 +0200
Subject: [PATCH 28/99] Fix check of unexpected pipe names in restore()

---
 spacy/language.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index 5a85a83ec..05dc32783 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -700,7 +700,7 @@ class DisabledPipes(list):
     def restore(self):
         '''Restore the pipeline to its state when DisabledPipes was created.'''
         current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
-        unexpected = [name for name in current if not self.nlp.has_pipe(name)]
+        unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)]
         if unexpected:
             # Don't change the pipeline if we're raising an error.
             self.nlp.pipeline = current

From 615c315d709035ea159f3fd3e49dd3cde594bff2 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 25 Oct 2017 14:56:53 +0200
Subject: [PATCH 29/99] Update train_new_entity_type example to use disable_pipes

---
 examples/training/train_new_entity_type.py | 174 ++++++++++++---------
 1 file changed, 96 insertions(+), 78 deletions(-)

diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index 5f10beebc..fc550b1ed 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -21,103 +21,121 @@
 After training your model, you can save it to a directory. We recommend
 wrapping models as Python packages, for ease of deployment.
For more details, see the documentation: -* Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner -* Saving and loading models: https://spacy.io/docs/usage/saving-loading +* Training: https://alpha.spacy.io/usage/training +* NER: https://alpha.spacy.io/usage/linguistic-features#named-entities -Developed for: spaCy 1.7.6 -Last updated for: spaCy 2.0.0a13 +Developed for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a18 """ from __future__ import unicode_literals, print_function import random from pathlib import Path -import random import spacy from spacy.gold import GoldParse, minibatch from spacy.pipeline import NeuralEntityRecognizer -from spacy.pipeline import TokenVectorEncoder + + +# new entity label +LABEL = 'ANIMAL' + +# training data +TRAIN_DATA = [ + ("Horses are too tall and they pretend to care about your feelings", + [(0, 6, 'ANIMAL')]), + + ("Do they bite?", []), + + ("horses are too tall and they pretend to care about your feelings", + [(0, 6, 'ANIMAL')]), + + ("horses pretend to care about your feelings", [(0, 6, 'ANIMAL')]), + + ("they pretend to care about your feelings, those horses", + [(48, 54, 'ANIMAL')]), + + ("horses?", [(0, 6, 'ANIMAL')]) +] + + +def main(model=None, new_model_name='animal', output_dir=None): + """Set up the pipeline and entity recognizer, and train the new entity. + + model (unicode): Model name to start off with. If None, a blank English + Language class is created. + new_model_name (unicode): Name of new model to create. Will be added to the + model meta and prefixed by the language code, e.g. 'en_animal'. + output_dir (unicode / Path): Optional output directory. If None, no model + will be saved. + """ + if model is not None: + nlp = spacy.load(model) # load existing spaCy model + print("Loaded model '%s'" % model) + else: + nlp = spacy.blank('en') # create blank Language class + print("Created blank 'en' model") + + # Add entity recognizer to model if it's not in the pipeline + if 'ner' not in nlp.pipe_names: + nlp.add_pipe(NeuralEntityRecognizer(nlp.vocab)) + + ner = nlp.get_pipe('ner') # get entity recognizer + ner.add_label(LABEL) # add new entity label to entity recognizer + + # get names of other pipes to disable them during training + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] + with nlp.disable_pipes(*other_pipes) as disabled: # only train NER + random.seed(0) + optimizer = nlp.begin_training(lambda: []) + for itn in range(50): + losses = {} + gold_parses = get_gold_parses(nlp.make_doc, TRAIN_DATA) + for batch in minibatch(gold_parses, size=3): + docs, golds = zip(*batch) + nlp.update(docs, golds, losses=losses, sgd=optimizer, + drop=0.35) + print(losses) + print(nlp.pipeline) + print(disabled.original_pipeline) + + # test the trained model + test_text = 'Do you like horses?' + doc = nlp(test_text) + print("Entities in '%s'" % test_text) + for ent in doc.ents: + print(ent.label_, ent.text) + + # save model to output directory + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + nlp.meta['name'] = new_model_name # rename model + nlp.to_disk(output_dir) + print("Saved model to", output_dir) + + # test the saved model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + doc2 = nlp2(test_text) + for ent in doc2.ents: + print(ent.label_, ent.text) def get_gold_parses(tokenizer, train_data): - '''Shuffle and create GoldParse objects''' + """Shuffle and create GoldParse objects. 
+ + tokenizer (Tokenizer): Tokenizer to processs the raw text. + train_data (list): The training data. + YIELDS (tuple): (doc, gold) tuples. + """ random.shuffle(train_data) for raw_text, entity_offsets in train_data: doc = tokenizer(raw_text) gold = GoldParse(doc, entities=entity_offsets) yield doc, gold - -def train_ner(nlp, train_data, output_dir): - random.seed(0) - optimizer = nlp.begin_training(lambda: []) - nlp.meta['name'] = 'en_ent_animal' - for itn in range(50): - losses = {} - for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3): - docs, golds = zip(*batch) - nlp.update(docs, golds, losses=losses, sgd=optimizer, drop=0.35) - print(losses) - if not output_dir: - return - elif not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - - -def main(model_name, output_directory=None): - print("Creating initial model", model_name) - nlp = spacy.blank(model_name) - if output_directory is not None: - output_directory = Path(output_directory) - - train_data = [ - ( - "Horses are too tall and they pretend to care about your feelings", - [(0, 6, 'ANIMAL')], - ), - ( - "Do they bite?", - [], - ), - - ( - "horses are too tall and they pretend to care about your feelings", - [(0, 6, 'ANIMAL')] - ), - ( - "horses pretend to care about your feelings", - [(0, 6, 'ANIMAL')] - ), - ( - "they pretend to care about your feelings, those horses", - [(48, 54, 'ANIMAL')] - ), - ( - "horses?", - [(0, 6, 'ANIMAL')] - ) - - ] - nlp.add_pipe(TokenVectorEncoder(nlp.vocab)) - ner = NeuralEntityRecognizer(nlp.vocab) - ner.add_label('ANIMAL') - nlp.add_pipe(ner) - train_ner(nlp, train_data, output_directory) - - # Test that the entity is recognized - text = 'Do you like horses?' - print("Ents in 'Do you like horses?':") - doc = nlp(text) - for ent in doc.ents: - print(ent.label_, ent.text) - if output_directory: - print("Loading from", output_directory) - nlp2 = spacy.load(output_directory) - doc2 = nlp2('Do you like horses?') - for ent in doc2.ents: - print(ent.label_, ent.text) - if __name__ == '__main__': import plac From 5117a7d24d0ca15f6fc04be13fa4a30527971ef8 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 15:54:02 +0200 Subject: [PATCH 30/99] Fix whitespace --- spacy/syntax/nn_parser.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index a9553fd1f..f93f44d9d 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -253,7 +253,7 @@ cdef class Parser: hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) if hist_size != 0: raise ValueError("Currently history size is hard-coded to 0") - if hist_width != 0: + if hist_width != 0: raise ValueError("Currently history width is hard-coded to 0") tok2vec = Tok2Vec(token_vector_width, embed_size, pretrained_dims=cfg.get('pretrained_dims', 0)) @@ -413,7 +413,7 @@ cdef class Parser: for stcls in state_objs: if not stcls.c.is_final(): states.push_back(stcls.c) - + feat_weights = state2vec.get_feat_weights() cdef int i cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T) @@ -432,7 +432,7 @@ cdef class Parser: PyErr_CheckSignals() return state_objs - cdef void _parseC(self, StateC* state, + cdef void _parseC(self, StateC* state, const float* feat_weights, const float* hW, const float* hb, int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil: token_ids = calloc(nr_feat, sizeof(int)) @@ -443,7 +443,7 @@ cdef class Parser: with gil: PyErr_SetFromErrno(MemoryError) 
PyErr_CheckSignals() - + while not state.is_final(): state.set_context_tokens(token_ids, nr_feat) memset(vectors, 0, nr_hidden * nr_piece * sizeof(float)) From 18aae423fbc09ca0507c6cabbe650143ae9b30bf Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 15:54:10 +0200 Subject: [PATCH 31/99] Remove import of non-existing function --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index f93f44d9d..913d2365f 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -48,7 +48,7 @@ from thinc.neural.util import get_array_module from .. import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts -from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune +from .._ml import Tok2Vec, doc2feats, rebatch from .._ml import Residual, drop_layer, flatten from .._ml import link_vectors_to_models from .._ml import HistoryFeatures From 273e6381839d810a81b281d2c4315d132e5f2bfb Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 16:03:05 +0200 Subject: [PATCH 32/99] Add vector data to model meta after training (see #1457) --- spacy/cli/train.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 2faea72e7..026b1fe44 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -144,7 +144,10 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, file_.write(json_dumps(scorer.scores)) meta_loc = output_path / ('model%d' % i) / 'meta.json' meta['accuracy'] = scorer.scores - meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps} + meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps, + 'gpu': gpu_wps} + meta['vectors'] = {'entries': nlp.vocab.vectors_length, + 'width': 0} meta['lang'] = nlp.lang meta['pipeline'] = pipeline meta['spacy_version'] = '>=%s' % about.__version__ From 057954695bc7baf88d301a7e756668b13757b6fe Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 16:03:26 +0200 Subject: [PATCH 33/99] Read pipeline and vector data off model in --generate-meta --- spacy/cli/package.py | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 83d4917f6..6b0811459 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -43,7 +43,7 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force prints(meta_path, title="Reading meta.json from file") meta = util.read_json(meta_path) else: - meta = generate_meta() + meta = generate_meta(input_dir) meta = validate_meta(meta, ['lang', 'name', 'version']) model_name = meta['lang'] + '_' + meta['name'] @@ -77,7 +77,8 @@ def create_file(file_path, contents): file_path.open('w', encoding='utf-8').write(contents) -def generate_meta(): +def generate_meta(model_path): + meta = {} settings = [('lang', 'Model language', 'en'), ('name', 'Model name', 'model'), ('version', 'Model version', '0.0.0'), @@ -87,31 +88,21 @@ def generate_meta(): ('email', 'Author email', False), ('url', 'Author website', False), ('license', 'License', 'CC BY-NC 3.0')] - prints("Enter the package settings for your model.", title="Generating meta.json") - meta = {} + nlp = util.load_model_from_path(Path(model_path)) + meta['pipeline'] = nlp.pipe_names + meta['vectors'] = {'width': nlp.vocab.vectors_length, + 'entries': len(nlp.vocab.vectors)} + prints("Enter the package settings for your 
model. The following " + "information will be read from your model data: pipeline, vectors.", + title="Generating meta.json") for setting, desc, default in settings: response = util.get_raw_input(desc, default) meta[setting] = default if response == '' and default else response - meta['pipeline'] = generate_pipeline() if about.__title__ != 'spacy': meta['parent_package'] = about.__title__ return meta -def generate_pipeline(): - prints("If set to 'True', the default pipeline is used. If set to 'False', " - "the pipeline will be disabled. Components should be specified as a " - "comma-separated list of component names, e.g. tagger, " - "parser, ner. For more information, see the docs on processing pipelines.", - title="Enter your model's pipeline components") - pipeline = util.get_raw_input("Pipeline components", True) - subs = {'True': True, 'False': False} - if pipeline in subs: - return subs[pipeline] - else: - return [p.strip() for p in pipeline.split(',')] - - def validate_meta(meta, keys): for key in keys: if key not in meta or meta[key] == '': From 11e3f19764e5958247edcef4eb00110ef9a7fb8f Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 16:08:26 +0200 Subject: [PATCH 34/99] Fix vectors data added after training (see #1457) --- spacy/cli/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 026b1fe44..da398751c 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -146,8 +146,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, meta['accuracy'] = scorer.scores meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps, 'gpu': gpu_wps} - meta['vectors'] = {'entries': nlp.vocab.vectors_length, - 'width': 0} + meta['vectors'] = {'width': nlp.vocab.vectors_length, + 'entries': len(nlp.vocab.vectors)} meta['lang'] = nlp.lang meta['pipeline'] = pipeline meta['spacy_version'] = '>=%s' % about.__version__ From 70de2dd0359169bc86ccd397446d0acd6d47f9d6 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 16:15:37 +0200 Subject: [PATCH 35/99] Display vectors in models directory if available (see #1457) --- website/_includes/_page_models.jade | 2 +- website/assets/js/main.js | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/website/_includes/_page_models.jade b/website/_includes/_page_models.jade index c5bd799f0..d4ce55f43 100644 --- a/website/_includes/_page_models.jade +++ b/website/_includes/_page_models.jade @@ -38,7 +38,7 @@ for id in CURRENT_MODELS +cell #[+label Size] +cell #[+tag=comps.size] #[span(data-tpl=id data-tpl-key="size") #[em n/a]] - each label in ["Pipeline", "Sources", "Author", "License"] + each label in ["Pipeline", "Vectors", "Sources", "Author", "License"] - var field = label.toLowerCase() +row +cell.u-nowrap diff --git a/website/assets/js/main.js b/website/assets/js/main.js index 42199538f..5cbd4d807 100644 --- a/website/assets/js/main.js +++ b/website/assets/js/main.js @@ -140,6 +140,10 @@ class ModelLoader { else return ({ ok: res.ok }) } + convertNumber(num, separator = ',') { + return num.toString().replace(/\B(?=(\d{3})+(?!\d))/g, separator); + } + getModels(compat) { this.compat = compat; for (let modelId of this.modelIds) { @@ -159,7 +163,7 @@ class ModelLoader { const template = new Templater(modelId); template.get('table').removeAttribute('data-loading'); template.get('error').style.display = 'block'; - for (let key of ['sources', 'pipeline', 'author', 'license']) { + for (let key of ['sources', 'pipeline', 'vectors', 
'author', 'license']) { template.get(key).parentElement.parentElement.style.display = 'none'; } } @@ -167,13 +171,14 @@ class ModelLoader { /** * Update model details in tables. Currently quite hacky :( */ - render({ lang, name, version, sources, pipeline, url, author, license, accuracy, size, description, notes }) { + render({ lang, name, version, sources, pipeline, vectors, url, author, license, accuracy, size, description, notes }) { const modelId = `${lang}_${name}`; const model = `${modelId}-${version}`; const template = new Templater(modelId); const getSources = s => (s instanceof Array) ? s.join(', ') : s; const getPipeline = p => p.map(comp => `${comp}`).join(', '); + const getVectors = v => `${this.convertNumber(v.entries)} (${v.width} dimensions)`; const getLink = (t, l) => `${t}`; const keys = { version, size, description, notes } @@ -182,6 +187,8 @@ class ModelLoader { if (sources) template.fill('sources', getSources(sources)); if (pipeline && pipeline.length) template.fill('pipeline', getPipeline(pipeline), true); else template.get('pipeline').parentElement.parentElement.style.display = 'none'; + if (vectors) template.fill('vectors', getVectors(vectors)); + else template.get('vectors').parentElement.parentElement.style.display = 'none'; if (author) template.fill('author', url ? getLink(author, url) : author, true); if (license) template.fill('license', this.licenses[license] ? getLink(license, this.licenses[license]) : license, true); From 91beacf5e327a5898935050ff8fdb9b9d9268821 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 16:19:38 +0200 Subject: [PATCH 36/99] Fix Matcher.__contains__ --- spacy/matcher.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 6c1069578..fd4a8026a 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -230,7 +230,7 @@ cdef class Matcher: key (unicode): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ - return key in self._patterns + return self._normalize_key(key) in self._patterns def add(self, key, on_match, *patterns): """Add a match-rule to the matcher. A match-rule consists of: an ID key, From c0b55ebdac8196f4432a381a1ad39d7746d19ded Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 16:31:11 +0200 Subject: [PATCH 37/99] Fix PhraseMatcher.__contains__ and add more tests --- spacy/matcher.pyx | 2 +- spacy/tests/test_matcher.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index fd4a8026a..401405c14 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -490,7 +490,7 @@ cdef class PhraseMatcher: RETURNS (bool): Whether the matcher contains rules for this match ID. 
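For context, this is the behaviour the two __contains__ fixes in these patches aim for, mirroring the tests added below. A minimal sketch; the blank English class is an assumption here, any Vocab would do:

    from spacy.lang.en import English
    from spacy.matcher import Matcher, PhraseMatcher

    nlp = English()
    matcher = Matcher(nlp.vocab)
    matcher.add('TEST', None, [{'ORTH': 'test'}])
    assert 'TEST' in matcher        # the key is normalized to its hash before lookup
    assert 'TEST2' not in matcher

    phrase_matcher = PhraseMatcher(nlp.vocab)
    phrase_matcher.add('TEST', None, nlp(u'test'))
    assert 'TEST' in phrase_matcher       # now checked against the _callbacks table
    assert 'TEST2' not in phrase_matcher
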
""" cdef hash_t ent_id = self.matcher._normalize_key(key) - return ent_id in self.phrase_ids + return ent_id in self._callbacks def __reduce__(self): return (self.__class__, (self.vocab,), None, None) diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py index 5b08ede39..8210467ea 100644 --- a/spacy/tests/test_matcher.py +++ b/spacy/tests/test_matcher.py @@ -64,6 +64,12 @@ def test_matcher_init(en_vocab, words): assert matcher(doc) == [] +def test_matcher_contains(matcher): + matcher.add('TEST', None, [{'ORTH': 'test'}]) + assert 'TEST' in matcher + assert 'TEST2' not in matcher + + def test_matcher_no_match(matcher): words = ["I", "like", "cheese", "."] doc = get_doc(matcher.vocab, words) @@ -112,7 +118,8 @@ def test_matcher_empty_dict(en_vocab): matcher.add('A.', None, [{'ORTH': 'a'}, {}]) matches = matcher(doc) assert matches[0][1:] == (0, 2) - + + def test_matcher_operator_shadow(en_vocab): matcher = Matcher(en_vocab) abc = ["a", "b", "c"] @@ -123,7 +130,8 @@ def test_matcher_operator_shadow(en_vocab): matches = matcher(doc) assert len(matches) == 1 assert matches[0][1:] == (0, 3) - + + def test_matcher_phrase_matcher(en_vocab): words = ["Google", "Now"] doc = get_doc(en_vocab, words) @@ -134,6 +142,22 @@ def test_matcher_phrase_matcher(en_vocab): assert len(matcher(doc)) == 1 +def test_phrase_matcher_length(en_vocab): + matcher = PhraseMatcher(en_vocab) + assert len(matcher) == 0 + matcher.add('TEST', None, get_doc(en_vocab, ['test'])) + assert len(matcher) == 1 + matcher.add('TEST2', None, get_doc(en_vocab, ['test2'])) + assert len(matcher) == 2 + + +def test_phrase_matcher_contains(en_vocab): + matcher = PhraseMatcher(en_vocab) + matcher.add('TEST', None, get_doc(en_vocab, ['test'])) + assert 'TEST' in matcher + assert 'TEST2' not in matcher + + def test_matcher_match_zero(matcher): words1 = 'He said , " some words " ...'.split() words2 = 'He said , " some three words " ...'.split() From 1bc07758faaf73a9cbcdca340b6343cb5d6cd76a Mon Sep 17 00:00:00 2001 From: mayukh18 Date: Wed, 25 Oct 2017 22:24:40 +0530 Subject: [PATCH 38/99] added few bengali pronouns --- spacy/lang/bn/morph_rules.py | 15 ++++++++++++++- spacy/lang/bn/stop_words.py | 4 ++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/spacy/lang/bn/morph_rules.py b/spacy/lang/bn/morph_rules.py index 8561f8676..6ca8fc097 100644 --- a/spacy/lang/bn/morph_rules.py +++ b/spacy/lang/bn/morph_rules.py @@ -12,11 +12,11 @@ MORPH_RULES = { 'কি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'}, 'সে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'}, 'কিসে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'}, - 'কাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, 'তাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'}, 'স্বয়ং': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, 'কোনগুলো': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'}, 'তুমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, + 'তুই': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, 'তাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'}, 'আমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One ', 'PronType': 'Prs', 'Case': 'Nom'}, 'যিনি': {LEMMA: PRON_LEMMA, 'Number': 
'Sing', 'PronType': 'Rel', 'Case': 'Nom'}, @@ -24,12 +24,15 @@ MORPH_RULES = { 'কোন': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'}, 'কারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, 'তোমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, + 'তোকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, 'খোদ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, 'কে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'}, 'যারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Rel', 'Case': 'Nom'}, 'যে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'}, 'তোমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, + 'তোরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, 'তোমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, + 'তোদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, 'আপন': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, 'এ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'}, 'নিজ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, @@ -42,6 +45,10 @@ MORPH_RULES = { 'আমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, + 'মোর': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, + 'মোদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, 'তার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, 'তোমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', @@ -50,7 +57,13 @@ MORPH_RULES = { 'Case': 'Nom'}, 'তোমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, + 'তোর': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, 'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, + 'কাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, + 'তোদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, + 'যাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, } } diff --git a/spacy/lang/bn/stop_words.py b/spacy/lang/bn/stop_words.py index 5b513da7b..ca0ae934a 100644 --- a/spacy/lang/bn/stop_words.py +++ b/spacy/lang/bn/stop_words.py @@ -22,7 +22,7 @@ STOP_WORDS = set(""" টি ঠিক তখন তত তথা তবু তবে তা তাঁকে তাঁদের তাঁর তাঁরা তাঁহারা তাই তাও তাকে তাতে তাদের তার তারপর তারা তারই তাহলে তাহা তাহাতে তাহার তিনই -তিনি তিনিও তুমি তুলে তেমন তো তোমার +তিনি তিনিও তুমি তুলে তেমন তো তোমার তুই তোরা তোর তোমাদের তোদের থাকবে থাকবেন থাকা থাকায় থাকে থাকেন থেকে থেকেই থেকেও থাকায় দিকে দিতে দিয়ে দিয়েছে দিয়েছেন দিলেন দিয়ে দু দুটি দুটো দেওয়া দেওয়ার দেখতে দেখা দেখে দেন দেয় দেশের দ্বারা দিয়েছে দিয়েছেন দেয় দেওয়া দেওয়ার দিন দুই @@ -32,7 +32,7 @@ STOP_WORDS = set(""" ফলে ফিরে ফের বছর বদলে বরং বলতে বলল বললেন বলা বলে বলেছেন বলেন বসে বহু বা বাদে বার বিনা বিভিন্ন বিশেষ বিষয়টি বেশ ব্যবহার ব্যাপারে বক্তব্য বন বেশি ভাবে ভাবেই -মত মতো মতোই মধ্যভাগে মধ্যে মধ্যেই মধ্যেও মনে মাত্র মাধ্যমে মানুষ মানুষের মোট মোটেই +মত মতো 
মতোই মধ্যভাগে মধ্যে মধ্যেই মধ্যেও মনে মাত্র মাধ্যমে মানুষ মানুষের মোট মোটেই মোদের মোর যখন যত যতটা যথেষ্ট যদি যদিও যা যাঁর যাঁরা যাওয়া যাওয়ার যাকে যাচ্ছে যাতে যাদের যান যাবে যায় যার যারা যায় যিনি যে যেখানে যেতে যেন যেমন রকম রয়েছে রাখা রেখে রয়েছে From 400812d9b17ac1ad054a2f4105ffae32dc45f945 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 22:17:11 +0200 Subject: [PATCH 39/99] Add add_label method to Pipe --- website/api/pipe.jade | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/website/api/pipe.jade b/website/api/pipe.jade index 66bdbcc62..c2afbde12 100644 --- a/website/api/pipe.jade +++ b/website/api/pipe.jade @@ -304,6 +304,21 @@ p Modify the pipe's model, to use the given parameter values. | The parameter values to use in the model. At the end of the | context, the original parameters are restored. ++h(2, "add_label") #{CLASSNAME}.add_label + +tag method + +p Add a new label to the pipe. + ++aside-code("Example"). + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + #{VARNAME}.add_label('MY_LABEL') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code label] + +cell unicode + +cell The label to add. + +h(2, "to_disk") #{CLASSNAME}.to_disk +tag method From e6536d231fc92dab27438dc1d8731d67483c4948 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 25 Oct 2017 22:17:23 +0200 Subject: [PATCH 40/99] Update new entity type training example in docs --- website/usage/_training/_ner.jade | 56 ++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/website/usage/_training/_ner.jade b/website/usage/_training/_ner.jade index ff3101c8f..ed58c4c6f 100644 --- a/website/usage/_training/_ner.jade +++ b/website/usage/_training/_ner.jade @@ -24,28 +24,60 @@ p | #[strong experiment on your own data] to find a solution that works best | for you. -+h(3, "example-new-entity-type") Example: Training an additional entity type ++h(3, "example-new-entity-type") Training an additional entity type p - | This script shows how to add a new entity type to an existing pre-trained - | NER model. To keep the example short and simple, only a few sentences are + | This script shows how to add a new entity type #[code ANIMAL] to an + | existing pre-trained NER model, or an empty #[code Language] class. To + | keep the example short and simple, only a few sentences are | provided as examples. In practice, you'll need many more — a few hundred | would be a good start. You will also likely need to mix in examples of | other entity types, which might be obtained by running the entity | recognizer over unlabelled sentences, and adding their annotations to the | training set. -p - | The actual training is performed by looping over the examples, and - | calling #[+api("language#update") #[code nlp.update()]]. The - | #[code update] method steps through the words of the input. At each word, - | it makes a prediction. It then consults the annotations provided on the - | #[+api("goldparse") #[code GoldParse]] instance, to see whether it was - | right. If it was wrong, it adjusts its weights so that the correct - | action will score higher next time. - +github("spacy", "examples/training/train_new_entity_type.py") +p Training a new entity type requires the following steps: + ++list("numbers") + +item + | Create #[+api("doc") #[code Doc]] and + | #[+api("goldparse") #[code GoldParse]] objects for + | #[strong each example in your training data]. 
+ + +item + | #[strong Load the model] you want to start with, or create an + | #[strong empty model] using + | #[+api("spacy#blank") #[code spacy.blank()]] with the ID of your + | language. If you're using an existing model, make sure to disable + | all other pipeline components during training using + | #[+api("language#disable_pipes") #[code nlp.disable_pipes]]. This way, + | you'll only be training the entity recognizer. + + +item + | #[strong Add the new entity label] to the entity recognizer using the + | #[+api("entityrecognizer#add_label") #[code add_label]] method. You + | can access the entity recognizer in the pipeline via + | #[code nlp.get_pipe('ner')]. + + +item + | #[strong Loop over] the examples and call + | #[+api("language#update") #[code nlp.update]], which steps through + | the words of the input. At each word, it makes a + | #[strong prediction]. It then consults the annotations provided on the + | #[+api("goldparse") #[code GoldParse]] instance, to see whether it was + | right. If it was wrong, it adjusts its weights so that the correct + | action will score higher next time. + + +item + | #[strong Save] the trained model using + | #[+api("language#to_disk") #[code nlp.to_disk()]]. + + +item + | #[strong Test] the model to make sure the new entity is recognized + | correctly. + +h(3, "example-ner-from-scratch") Example: Training an NER system from scratch p From b0f3ea2200ab62bae2482884dbcce8e8e376c1d1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 26 Oct 2017 12:38:23 +0200 Subject: [PATCH 41/99] Fix names of pipeline components NeuralDependencyParser --> DependencyParser NeuralEntityRecognizer --> EntityRecognizer TokenVectorEncoder --> Tensorizer NeuralLabeller --> MultitaskObjective --- spacy/language.py | 13 ++- spacy/pipeline.pxd | 21 ----- spacy/pipeline.pyx | 86 ++++--------------- spacy/tests/doc/test_add_entities.py | 3 +- spacy/tests/parser/test_add_label.py | 4 +- spacy/tests/parser/test_neural_parser.py | 2 +- spacy/tests/parser/test_preset_sbd.py | 4 +- spacy/tests/parser/test_to_from_bytes_disk.py | 6 +- .../serialize/test_serialize_parser_ner.py | 4 +- .../tests/serialize/test_serialize_tagger.py | 2 +- .../serialize/test_serialize_tensorizer.py | 2 +- 11 files changed, 35 insertions(+), 112 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 933ca772d..c4777898e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -18,8 +18,8 @@ from .tagger import Tagger from .lemmatizer import Lemmatizer from .syntax.parser import get_templates -from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger -from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer +from .pipeline import DependencyParser, Tensorizer, Tagger +from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer from .compat import json_dumps, izip, copy_reg from .scorer import Scorer @@ -75,9 +75,6 @@ class BaseDefaults(object): infixes = tuple(TOKENIZER_INFIXES) tag_map = dict(TAG_MAP) tokenizer_exceptions = {} - parser_features = get_templates('parser') - entity_features = get_templates('ner') - tagger_features = Tagger.feature_templates # TODO -- fix this stop_words = set() lemma_rules = {} lemma_exc = {} @@ -102,9 +99,9 @@ class Language(object): factories = { 'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), 'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), - 'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg), - 'parser': lambda nlp, **cfg: 
NeuralDependencyParser(nlp.vocab, **cfg), - 'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg), + 'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), + 'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), + 'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), 'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), 'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg) } diff --git a/spacy/pipeline.pxd b/spacy/pipeline.pxd index e9b7f0f73..e69de29bb 100644 --- a/spacy/pipeline.pxd +++ b/spacy/pipeline.pxd @@ -1,21 +0,0 @@ -from .syntax.parser cimport Parser -#from .syntax.beam_parser cimport BeamParser -from .syntax.ner cimport BiluoPushDown -from .syntax.arc_eager cimport ArcEager -from .tagger cimport Tagger - - -cdef class EntityRecognizer(Parser): - pass - - -cdef class DependencyParser(Parser): - pass - - -#cdef class BeamEntityRecognizer(BeamParser): -# pass -# -# -#cdef class BeamDependencyParser(BeamParser): -# pass diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 7c1976dfa..6e4ef2f3e 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -26,11 +26,8 @@ from thinc.neural.util import to_categorical from thinc.neural._classes.difference import Siamese, CauchySimilarity from .tokens.doc cimport Doc -from .syntax.parser cimport Parser as LinearParser -from .syntax.nn_parser cimport Parser as NeuralParser +from .syntax.nn_parser cimport Parser from .syntax import nonproj -from .syntax.parser import get_templates as get_feature_templates -from .syntax.beam_parser cimport BeamParser from .syntax.ner cimport BiluoPushDown from .syntax.arc_eager cimport ArcEager from .tagger import Tagger @@ -217,7 +214,7 @@ def _load_cfg(path): return {} -class TokenVectorEncoder(BaseThincComponent): +class Tensorizer(BaseThincComponent): """Assign position-sensitive vectors to tokens, using a CNN or RNN.""" name = 'tensorizer' @@ -329,7 +326,7 @@ class TokenVectorEncoder(BaseThincComponent): link_vectors_to_models(self.vocab) -class NeuralTagger(BaseThincComponent): +class Tagger(BaseThincComponent): name = 'tagger' def __init__(self, vocab, model=True, **cfg): self.vocab = vocab @@ -513,7 +510,11 @@ class NeuralTagger(BaseThincComponent): return self -class NeuralLabeller(NeuralTagger): +class MultitaskObjective(Tagger): + '''Assist training of a parser or tagger, by training a side-objective. 
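In user code, the renames in this patch amount to importing the same components under their new names. A rough sketch of the mapping, based on the commit message and the diff below; the bare Vocab() construction is only for illustration:

    from spacy.vocab import Vocab
    from spacy.pipeline import Tensorizer, Tagger, DependencyParser, EntityRecognizer

    vocab = Vocab()
    tensorizer = Tensorizer(vocab)      # was TokenVectorEncoder
    tagger = Tagger(vocab)              # was NeuralTagger
    parser = DependencyParser(vocab)    # was NeuralDependencyParser
    ner = EntityRecognizer(vocab)       # was NeuralEntityRecognizer
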
+ + Experimental + ''' name = 'nn_labeller' def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): self.vocab = vocab @@ -532,7 +533,7 @@ class NeuralLabeller(NeuralTagger): self.make_label = target else: raise ValueError( - "NeuralLabeller target should be function or one of " + "MultitaskObjective target should be function or one of " "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']") self.cfg = dict(cfg) self.cfg.setdefault('cnn_maxout_pieces', 2) @@ -752,45 +753,7 @@ class TextCategorizer(BaseThincComponent): link_vectors_to_models(self.vocab) -cdef class EntityRecognizer(LinearParser): - """Annotate named entities on Doc objects.""" - TransitionSystem = BiluoPushDown - - feature_templates = get_feature_templates('ner') - - def add_label(self, label): - LinearParser.add_label(self, label) - if isinstance(label, basestring): - label = self.vocab.strings[label] - - -cdef class BeamEntityRecognizer(BeamParser): - """Annotate named entities on Doc objects.""" - TransitionSystem = BiluoPushDown - - feature_templates = get_feature_templates('ner') - - def add_label(self, label): - LinearParser.add_label(self, label) - if isinstance(label, basestring): - label = self.vocab.strings[label] - - -cdef class DependencyParser(LinearParser): - TransitionSystem = ArcEager - feature_templates = get_feature_templates('basic') - - def add_label(self, label): - LinearParser.add_label(self, label) - if isinstance(label, basestring): - label = self.vocab.strings[label] - - @property - def postprocesses(self): - return [nonproj.deprojectivize] - - -cdef class NeuralDependencyParser(NeuralParser): +cdef class DependencyParser(Parser): name = 'parser' TransitionSystem = ArcEager @@ -800,17 +763,17 @@ cdef class NeuralDependencyParser(NeuralParser): def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): for target in []: - labeller = NeuralLabeller(self.vocab, target=target) + labeller = MultitaskObjective(self.vocab, target=target) tok2vec = self.model[0] labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) pipeline.append(labeller) self._multitasks.append(labeller) def __reduce__(self): - return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) + return (DependencyParser, (self.vocab, self.moves, self.model), None, None) -cdef class NeuralEntityRecognizer(NeuralParser): +cdef class EntityRecognizer(Parser): name = 'ner' TransitionSystem = BiluoPushDown @@ -818,31 +781,14 @@ cdef class NeuralEntityRecognizer(NeuralParser): def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): for target in []: - labeller = NeuralLabeller(self.vocab, target=target) + labeller = MultitaskObjective(self.vocab, target=target) tok2vec = self.model[0] labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) pipeline.append(labeller) self._multitasks.append(labeller) def __reduce__(self): - return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) + return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None) -cdef class BeamDependencyParser(BeamParser): - TransitionSystem = ArcEager - - feature_templates = get_feature_templates('basic') - - def add_label(self, label): - Parser.add_label(self, label) - if isinstance(label, basestring): - label = self.vocab.strings[label] - - @property - def postprocesses(self): - return [nonproj.deprojectivize] - - - -__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser', - 'BeamEntityRecognizer', 'TokenVectorEnoder'] +__all__ = 
['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer'] diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index cc74aa0ae..cd444ba81 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -10,7 +10,8 @@ import pytest def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - ner = EntityRecognizer(en_vocab, features=[(2,), (3,)]) + ner = EntityRecognizer(en_vocab) + ner.begin_training([]) ner(doc) assert len(list(doc.ents)) == 0 diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 3fbfc96a6..c3bceb106 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -9,7 +9,7 @@ from ...attrs import NORM from ...gold import GoldParse from ...vocab import Vocab from ...tokens import Doc -from ...pipeline import NeuralDependencyParser +from ...pipeline import DependencyParser numpy.random.seed(0) @@ -21,7 +21,7 @@ def vocab(): @pytest.fixture def parser(vocab): - parser = NeuralDependencyParser(vocab) + parser = DependencyParser(vocab) parser.cfg['token_vector_width'] = 8 parser.cfg['hidden_width'] = 30 parser.cfg['hist_size'] = 0 diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index ae20cd5f0..e85c61276 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -6,7 +6,7 @@ import numpy from ..._ml import chain, Tok2Vec, doc2feats from ...vocab import Vocab -from ...pipeline import TokenVectorEncoder +from ...pipeline import Tensorizer from ...syntax.arc_eager import ArcEager from ...syntax.nn_parser import Parser from ...tokens.doc import Doc diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 4c973bd97..9b8c98735 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -8,7 +8,7 @@ from ...attrs import NORM from ...gold import GoldParse from ...vocab import Vocab from ...tokens import Doc -from ...pipeline import NeuralDependencyParser +from ...pipeline import DependencyParser @pytest.fixture def vocab(): @@ -16,7 +16,7 @@ def vocab(): @pytest.fixture def parser(vocab): - parser = NeuralDependencyParser(vocab) + parser = DependencyParser(vocab) parser.cfg['token_vector_width'] = 4 parser.cfg['hidden_width'] = 32 #parser.add_label('right') diff --git a/spacy/tests/parser/test_to_from_bytes_disk.py b/spacy/tests/parser/test_to_from_bytes_disk.py index b0a10fa8e..48c412b7a 100644 --- a/spacy/tests/parser/test_to_from_bytes_disk.py +++ b/spacy/tests/parser/test_to_from_bytes_disk.py @@ -1,11 +1,11 @@ import pytest -from ...pipeline import NeuralDependencyParser +from ...pipeline import DependencyParser @pytest.fixture def parser(en_vocab): - parser = NeuralDependencyParser(en_vocab) + parser = DependencyParser(en_vocab) parser.add_label('nsubj') parser.model, cfg = parser.Model(parser.moves.n_moves) parser.cfg.update(cfg) @@ -14,7 +14,7 @@ def parser(en_vocab): @pytest.fixture def blank_parser(en_vocab): - parser = NeuralDependencyParser(en_vocab) + parser = DependencyParser(en_vocab) return parser diff --git a/spacy/tests/serialize/test_serialize_parser_ner.py b/spacy/tests/serialize/test_serialize_parser_ner.py index ae9e23e9a..cbe97b716 100644 --- a/spacy/tests/serialize/test_serialize_parser_ner.py +++ b/spacy/tests/serialize/test_serialize_parser_ner.py @@ -2,8 +2,8 @@ from __future__ import 
unicode_literals from ..util import make_tempdir -from ...pipeline import NeuralDependencyParser as DependencyParser -from ...pipeline import NeuralEntityRecognizer as EntityRecognizer +from ...pipeline import DependencyParser +from ...pipeline import EntityRecognizer import pytest diff --git a/spacy/tests/serialize/test_serialize_tagger.py b/spacy/tests/serialize/test_serialize_tagger.py index 475be1cef..7b7dedae0 100644 --- a/spacy/tests/serialize/test_serialize_tagger.py +++ b/spacy/tests/serialize/test_serialize_tagger.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ..util import make_tempdir -from ...pipeline import NeuralTagger as Tagger +from ...pipeline import Tagger import pytest diff --git a/spacy/tests/serialize/test_serialize_tensorizer.py b/spacy/tests/serialize/test_serialize_tensorizer.py index ba01a2fa6..bc751a686 100644 --- a/spacy/tests/serialize/test_serialize_tensorizer.py +++ b/spacy/tests/serialize/test_serialize_tensorizer.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ..util import make_tempdir -from ...pipeline import TokenVectorEncoder as Tensorizer +from ...pipeline import Tensorizer import pytest From a8abc47811e732ac49c402b0a0b41ca585d584c8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 26 Oct 2017 12:40:40 +0200 Subject: [PATCH 42/99] Rename BaseThincComponent --> Pipe --- spacy/pipeline.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 6e4ef2f3e..c52c29883 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -83,7 +83,7 @@ class SentenceSegmenter(object): yield doc[start : len(doc)] -class BaseThincComponent(object): +class Pipe(object): name = None @classmethod @@ -214,7 +214,7 @@ def _load_cfg(path): return {} -class Tensorizer(BaseThincComponent): +class Tensorizer(Pipe): """Assign position-sensitive vectors to tokens, using a CNN or RNN.""" name = 'tensorizer' @@ -326,7 +326,7 @@ class Tensorizer(BaseThincComponent): link_vectors_to_models(self.vocab) -class Tagger(BaseThincComponent): +class Tagger(Pipe): name = 'tagger' def __init__(self, vocab, model=True, **cfg): self.vocab = vocab @@ -623,7 +623,7 @@ class MultitaskObjective(Tagger): return '%s-%s' % (tags[i], ents[i]) -class SimilarityHook(BaseThincComponent): +class SimilarityHook(Pipe): """ Experimental @@ -675,7 +675,7 @@ class SimilarityHook(BaseThincComponent): link_vectors_to_models(self.vocab) -class TextCategorizer(BaseThincComponent): +class TextCategorizer(Pipe): name = 'textcat' @classmethod From 33f8c58782f96d787f862b32ead86f933a1a574e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 26 Oct 2017 12:42:05 +0200 Subject: [PATCH 43/99] Remove obsolete parser.pyx --- spacy/syntax/_parse_features.pxd | 259 --------------- spacy/syntax/_parse_features.pyx | 419 ------------------------ spacy/syntax/beam_parser.pxd | 10 - spacy/syntax/beam_parser.pyx | 239 -------------- spacy/syntax/parser.pxd | 24 -- spacy/syntax/parser.pyx | 526 ------------------------------- 6 files changed, 1477 deletions(-) delete mode 100644 spacy/syntax/_parse_features.pxd delete mode 100644 spacy/syntax/_parse_features.pyx delete mode 100644 spacy/syntax/beam_parser.pxd delete mode 100644 spacy/syntax/beam_parser.pyx delete mode 100644 spacy/syntax/parser.pxd delete mode 100644 spacy/syntax/parser.pyx diff --git a/spacy/syntax/_parse_features.pxd b/spacy/syntax/_parse_features.pxd deleted file mode 100644 index 0842e3504..000000000 --- a/spacy/syntax/_parse_features.pxd +++ 
/dev/null @@ -1,259 +0,0 @@ -from thinc.typedefs cimport atom_t - -from .stateclass cimport StateClass -from ._state cimport StateC - - -cdef int fill_context(atom_t* context, const StateC* state) nogil -# Context elements - -# Ensure each token's attributes are listed: w, p, c, c6, c4. The order -# is referenced by incrementing the enum... - -# Tokens are listed in left-to-right order. -#cdef size_t* SLOTS = [ -# S2w, S1w, -# S0l0w, S0l2w, S0lw, -# S0w, -# S0r0w, S0r2w, S0rw, -# N0l0w, N0l2w, N0lw, -# P2w, P1w, -# N0w, N1w, N2w, N3w, 0 -#] - -# NB: The order of the enum is _NOT_ arbitrary!! -cpdef enum: - S2w - S2W - S2p - S2c - S2c4 - S2c6 - S2L - S2_prefix - S2_suffix - S2_shape - S2_ne_iob - S2_ne_type - - S1w - S1W - S1p - S1c - S1c4 - S1c6 - S1L - S1_prefix - S1_suffix - S1_shape - S1_ne_iob - S1_ne_type - - S1rw - S1rW - S1rp - S1rc - S1rc4 - S1rc6 - S1rL - S1r_prefix - S1r_suffix - S1r_shape - S1r_ne_iob - S1r_ne_type - - S0lw - S0lW - S0lp - S0lc - S0lc4 - S0lc6 - S0lL - S0l_prefix - S0l_suffix - S0l_shape - S0l_ne_iob - S0l_ne_type - - S0l2w - S0l2W - S0l2p - S0l2c - S0l2c4 - S0l2c6 - S0l2L - S0l2_prefix - S0l2_suffix - S0l2_shape - S0l2_ne_iob - S0l2_ne_type - - S0w - S0W - S0p - S0c - S0c4 - S0c6 - S0L - S0_prefix - S0_suffix - S0_shape - S0_ne_iob - S0_ne_type - - S0r2w - S0r2W - S0r2p - S0r2c - S0r2c4 - S0r2c6 - S0r2L - S0r2_prefix - S0r2_suffix - S0r2_shape - S0r2_ne_iob - S0r2_ne_type - - S0rw - S0rW - S0rp - S0rc - S0rc4 - S0rc6 - S0rL - S0r_prefix - S0r_suffix - S0r_shape - S0r_ne_iob - S0r_ne_type - - N0l2w - N0l2W - N0l2p - N0l2c - N0l2c4 - N0l2c6 - N0l2L - N0l2_prefix - N0l2_suffix - N0l2_shape - N0l2_ne_iob - N0l2_ne_type - - N0lw - N0lW - N0lp - N0lc - N0lc4 - N0lc6 - N0lL - N0l_prefix - N0l_suffix - N0l_shape - N0l_ne_iob - N0l_ne_type - - N0w - N0W - N0p - N0c - N0c4 - N0c6 - N0L - N0_prefix - N0_suffix - N0_shape - N0_ne_iob - N0_ne_type - - N1w - N1W - N1p - N1c - N1c4 - N1c6 - N1L - N1_prefix - N1_suffix - N1_shape - N1_ne_iob - N1_ne_type - - N2w - N2W - N2p - N2c - N2c4 - N2c6 - N2L - N2_prefix - N2_suffix - N2_shape - N2_ne_iob - N2_ne_type - - P1w - P1W - P1p - P1c - P1c4 - P1c6 - P1L - P1_prefix - P1_suffix - P1_shape - P1_ne_iob - P1_ne_type - - P2w - P2W - P2p - P2c - P2c4 - P2c6 - P2L - P2_prefix - P2_suffix - P2_shape - P2_ne_iob - P2_ne_type - - E0w - E0W - E0p - E0c - E0c4 - E0c6 - E0L - E0_prefix - E0_suffix - E0_shape - E0_ne_iob - E0_ne_type - - E1w - E1W - E1p - E1c - E1c4 - E1c6 - E1L - E1_prefix - E1_suffix - E1_shape - E1_ne_iob - E1_ne_type - - # Misc features at the end - dist - N0lv - S0lv - S0rv - S1lv - S1rv - - S0_has_head - S1_has_head - S2_has_head - - CONTEXT_SIZE diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx deleted file mode 100644 index 2e0db4877..000000000 --- a/spacy/syntax/_parse_features.pyx +++ /dev/null @@ -1,419 +0,0 @@ -""" -Fill an array, context, with every _atomic_ value our features reference. -We then write the _actual features_ as tuples of the atoms. The machinery -that translates from the tuples to feature-extractors (which pick the values -out of "context") is in features/extractor.pyx - -The atomic feature names are listed in a big enum, so that the feature tuples -can refer to them. 
-""" -# coding: utf-8 -from __future__ import unicode_literals - -from libc.string cimport memset -from itertools import combinations -from cymem.cymem cimport Pool - -from ..structs cimport TokenC -from .stateclass cimport StateClass -from ._state cimport StateC - - -cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: - if token is NULL: - context[0] = 0 - context[1] = 0 - context[2] = 0 - context[3] = 0 - context[4] = 0 - context[5] = 0 - context[6] = 0 - context[7] = 0 - context[8] = 0 - context[9] = 0 - context[10] = 0 - context[11] = 0 - else: - context[0] = token.lex.orth - context[1] = token.lemma - context[2] = token.tag - context[3] = token.lex.cluster - # We've read in the string little-endian, so now we can take & (2**n)-1 - # to get the first n bits of the cluster. - # e.g. s = "1110010101" - # s = ''.join(reversed(s)) - # first_4_bits = int(s, 2) - # print first_4_bits - # 5 - # print "{0:b}".format(prefix).ljust(4, '0') - # 1110 - # What we're doing here is picking a number where all bits are 1, e.g. - # 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in - # the source that are set to 1. - context[4] = token.lex.cluster & 15 - context[5] = token.lex.cluster & 63 - context[6] = token.dep if token.head != 0 else 0 - context[7] = token.lex.prefix - context[8] = token.lex.suffix - context[9] = token.lex.shape - context[10] = token.ent_iob - context[11] = token.ent_type - -cdef int fill_context(atom_t* ctxt, const StateC* st) nogil: - # Take care to fill every element of context! - # We could memset, but this makes it very easy to have broken features that - # make almost no impact on accuracy. If instead they're unset, the impact - # tends to be dramatic, so we get an obvious regression to fix... - fill_token(&ctxt[S2w], st.S_(2)) - fill_token(&ctxt[S1w], st.S_(1)) - fill_token(&ctxt[S1rw], st.R_(st.S(1), 1)) - fill_token(&ctxt[S0lw], st.L_(st.S(0), 1)) - fill_token(&ctxt[S0l2w], st.L_(st.S(0), 2)) - fill_token(&ctxt[S0w], st.S_(0)) - fill_token(&ctxt[S0r2w], st.R_(st.S(0), 2)) - fill_token(&ctxt[S0rw], st.R_(st.S(0), 1)) - fill_token(&ctxt[N0lw], st.L_(st.B(0), 1)) - fill_token(&ctxt[N0l2w], st.L_(st.B(0), 2)) - fill_token(&ctxt[N0w], st.B_(0)) - fill_token(&ctxt[N1w], st.B_(1)) - fill_token(&ctxt[N2w], st.B_(2)) - fill_token(&ctxt[P1w], st.safe_get(st.B(0)-1)) - fill_token(&ctxt[P2w], st.safe_get(st.B(0)-2)) - - fill_token(&ctxt[E0w], st.E_(0)) - fill_token(&ctxt[E1w], st.E_(1)) - - if st.stack_depth() >= 1 and not st.eol(): - ctxt[dist] = min_(st.B(0) - st.E(0), 5) - else: - ctxt[dist] = 0 - ctxt[N0lv] = min_(st.n_L(st.B(0)), 5) - ctxt[S0lv] = min_(st.n_L(st.S(0)), 5) - ctxt[S0rv] = min_(st.n_R(st.S(0)), 5) - ctxt[S1lv] = min_(st.n_L(st.S(1)), 5) - ctxt[S1rv] = min_(st.n_R(st.S(1)), 5) - - ctxt[S0_has_head] = 0 - ctxt[S1_has_head] = 0 - ctxt[S2_has_head] = 0 - if st.stack_depth() >= 1: - ctxt[S0_has_head] = st.has_head(st.S(0)) + 1 - if st.stack_depth() >= 2: - ctxt[S1_has_head] = st.has_head(st.S(1)) + 1 - if st.stack_depth() >= 3: - ctxt[S2_has_head] = st.has_head(st.S(2)) + 1 - - -cdef inline int min_(int a, int b) nogil: - return a if a > b else b - - -ner = ( - (N0W,), - (P1W,), - (N1W,), - (P2W,), - (N2W,), - - (P1W, N0W,), - (N0W, N1W), - - (N0_prefix,), - (N0_suffix,), - - (P1_shape,), - (N0_shape,), - (N1_shape,), - (P1_shape, N0_shape,), - (N0_shape, P1_shape,), - (P1_shape, N0_shape, N1_shape), - (N2_shape,), - (P2_shape,), - - #(P2_norm, P1_norm, W_norm), - #(P1_norm, W_norm, N1_norm), - #(W_norm, N1_norm, N2_norm) - - 
(P2p,), - (P1p,), - (N0p,), - (N1p,), - (N2p,), - - (P1p, N0p), - (N0p, N1p), - (P2p, P1p, N0p), - (P1p, N0p, N1p), - (N0p, N1p, N2p), - - (P2c,), - (P1c,), - (N0c,), - (N1c,), - (N2c,), - - (P1c, N0c), - (N0c, N1c), - - (E0W,), - (E0c,), - (E0p,), - - (E0W, N0W), - (E0c, N0W), - (E0p, N0W), - - (E0p, P1p, N0p), - (E0c, P1c, N0c), - - (E0w, P1c), - (E0p, P1p), - (E0c, P1c), - (E0p, E1p), - (E0c, P1p), - - (E1W,), - (E1c,), - (E1p,), - - (E0W, E1W), - (E0W, E1p,), - (E0p, E1W,), - (E0p, E1W), - - (P1_ne_iob,), - (P1_ne_iob, P1_ne_type), - (N0w, P1_ne_iob, P1_ne_type), - - (N0_shape,), - (N1_shape,), - (N2_shape,), - (P1_shape,), - (P2_shape,), - - (N0_prefix,), - (N0_suffix,), - - (P1_ne_iob,), - (P2_ne_iob,), - (P1_ne_iob, P2_ne_iob), - (P1_ne_iob, P1_ne_type), - (P2_ne_iob, P2_ne_type), - (N0w, P1_ne_iob, P1_ne_type), - - (N0w, N1w), -) - - -unigrams = ( - (S2W, S2p), - (S2c6, S2p), - - (S1W, S1p), - (S1c6, S1p), - - (S0W, S0p), - (S0c6, S0p), - - (N0W, N0p), - (N0p,), - (N0c,), - (N0c6, N0p), - (N0L,), - - (N1W, N1p), - (N1c6, N1p), - - (N2W, N2p), - (N2c6, N2p), - - (S0r2W, S0r2p), - (S0r2c6, S0r2p), - (S0r2L,), - - (S0rW, S0rp), - (S0rc6, S0rp), - (S0rL,), - - (S0l2W, S0l2p), - (S0l2c6, S0l2p), - (S0l2L,), - - (S0lW, S0lp), - (S0lc6, S0lp), - (S0lL,), - - (N0l2W, N0l2p), - (N0l2c6, N0l2p), - (N0l2L,), - - (N0lW, N0lp), - (N0lc6, N0lp), - (N0lL,), -) - - -s0_n0 = ( - (S0W, S0p, N0W, N0p), - (S0c, S0p, N0c, N0p), - (S0c6, S0p, N0c6, N0p), - (S0c4, S0p, N0c4, N0p), - (S0p, N0p), - (S0W, N0p), - (S0p, N0W), - (S0W, N0c), - (S0c, N0W), - (S0p, N0c), - (S0c, N0p), - (S0W, S0rp, N0p), - (S0p, S0rp, N0p), - (S0p, N0lp, N0W), - (S0p, N0lp, N0p), - (S0L, N0p), - (S0p, S0rL, N0p), - (S0p, N0lL, N0p), - (S0p, S0rv, N0p), - (S0p, N0lv, N0p), - (S0c6, S0rL, S0r2L, N0p), - (S0p, N0lL, N0l2L, N0p), -) - - -s1_s0 = ( - (S1p, S0p), - (S1p, S0p, S0_has_head), - (S1W, S0p), - (S1W, S0p, S0_has_head), - (S1c, S0p), - (S1c, S0p, S0_has_head), - (S1p, S1rL, S0p), - (S1p, S1rL, S0p, S0_has_head), - (S1p, S0lL, S0p), - (S1p, S0lL, S0p, S0_has_head), - (S1p, S0lL, S0l2L, S0p), - (S1p, S0lL, S0l2L, S0p, S0_has_head), - (S1L, S0L, S0W), - (S1L, S0L, S0p), - (S1p, S1L, S0L, S0p), - (S1p, S0p), -) - - -s1_n0 = ( - (S1p, N0p), - (S1c, N0c), - (S1c, N0p), - (S1p, N0c), - (S1W, S1p, N0p), - (S1p, N0W, N0p), - (S1c6, S1p, N0c6, N0p), - (S1L, N0p), - (S1p, S1rL, N0p), - (S1p, S1rp, N0p), -) - - -s0_n1 = ( - (S0p, N1p), - (S0c, N1c), - (S0c, N1p), - (S0p, N1c), - (S0W, S0p, N1p), - (S0p, N1W, N1p), - (S0c6, S0p, N1c6, N1p), - (S0L, N1p), - (S0p, S0rL, N1p), -) - - -n0_n1 = ( - (N0W, N0p, N1W, N1p), - (N0W, N0p, N1p), - (N0p, N1W, N1p), - (N0c, N0p, N1c, N1p), - (N0c6, N0p, N1c6, N1p), - (N0c, N1c), - (N0p, N1c), -) - -tree_shape = ( - (dist,), - (S0p, S0_has_head, S1_has_head, S2_has_head), - (S0p, S0lv, S0rv), - (N0p, N0lv), -) - -trigrams = ( - (N0p, N1p, N2p), - (S0p, S0lp, S0l2p), - (S0p, S0rp, S0r2p), - (S0p, S1p, S2p), - (S1p, S0p, N0p), - (S0p, S0lp, N0p), - (S0p, N0p, N0lp), - (N0p, N0lp, N0l2p), - - (S0W, S0p, S0rL, S0r2L), - (S0p, S0rL, S0r2L), - - (S0W, S0p, S0lL, S0l2L), - (S0p, S0lL, S0l2L), - - (N0W, N0p, N0lL, N0l2L), - (N0p, N0lL, N0l2L), -) - - -words = ( - S2w, - S1w, - S1rw, - S0lw, - S0l2w, - S0w, - S0r2w, - S0rw, - N0lw, - N0l2w, - N0w, - N1w, - N2w, - P1w, - P2w -) - -tags = ( - S2p, - S1p, - S1rp, - S0lp, - S0l2p, - S0p, - S0r2p, - S0rp, - N0lp, - N0l2p, - N0p, - N1p, - N2p, - P1p, - P2p -) - -labels = ( - S2L, - S1L, - S1rL, - S0lL, - S0l2L, - S0L, - S0r2L, - S0rL, - N0lL, - N0l2L, - N0L, - N1L, - 
N2L, - P1L, - P2L -) diff --git a/spacy/syntax/beam_parser.pxd b/spacy/syntax/beam_parser.pxd deleted file mode 100644 index 35a60cbf3..000000000 --- a/spacy/syntax/beam_parser.pxd +++ /dev/null @@ -1,10 +0,0 @@ -from .parser cimport Parser -from ..structs cimport TokenC -from thinc.typedefs cimport weight_t - - -cdef class BeamParser(Parser): - cdef public int beam_width - cdef public weight_t beam_density - - cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1 diff --git a/spacy/syntax/beam_parser.pyx b/spacy/syntax/beam_parser.pyx deleted file mode 100644 index 68e9f27af..000000000 --- a/spacy/syntax/beam_parser.pyx +++ /dev/null @@ -1,239 +0,0 @@ -""" -MALT-style dependency parser -""" -# cython: profile=True -# cython: experimental_cpp_class_def=True -# cython: cdivision=True -# cython: infer_types=True -# coding: utf-8 - -from __future__ import unicode_literals, print_function -cimport cython - -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF -from libc.stdint cimport uint32_t, uint64_t -from libc.string cimport memset, memcpy -from libc.stdlib cimport rand -from libc.math cimport log, exp, isnan, isinf -from cymem.cymem cimport Pool, Address -from murmurhash.mrmr cimport real_hash64 as hash64 -from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t -from thinc.linear.features cimport ConjunctionExtracter -from thinc.structs cimport FeatureC, ExampleC -from thinc.extra.search cimport Beam, MaxViolation -from thinc.extra.eg cimport Example -from thinc.extra.mb cimport Minibatch - -from ..structs cimport TokenC -from ..tokens.doc cimport Doc -from ..strings cimport StringStore -from .transition_system cimport TransitionSystem, Transition -from ..gold cimport GoldParse -from . import _parse_features -from ._parse_features cimport CONTEXT_SIZE -from ._parse_features cimport fill_context -from .stateclass cimport StateClass -from .parser cimport Parser - - -DEBUG = False -def set_debug(val): - global DEBUG - DEBUG = val - - -def get_templates(name): - pf = _parse_features - if name == 'ner': - return pf.ner - elif name == 'debug': - return pf.unigrams - else: - return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ - pf.tree_shape + pf.trigrams) - - -cdef int BEAM_WIDTH = 16 -cdef weight_t BEAM_DENSITY = 0.001 - -cdef class BeamParser(Parser): - def __init__(self, *args, **kwargs): - self.beam_width = kwargs.get('beam_width', BEAM_WIDTH) - self.beam_density = kwargs.get('beam_density', BEAM_DENSITY) - Parser.__init__(self, *args, **kwargs) - - cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil: - with gil: - self._parseC(tokens, length, nr_feat, self.moves.n_moves) - - cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1: - cdef Beam beam = Beam(self.moves.n_moves, self.beam_width, min_density=self.beam_density) - # TODO: How do we handle new labels here? 
This increases nr_class - beam.initialize(self.moves.init_beam_state, length, tokens) - beam.check_done(_check_final_state, NULL) - if beam.is_done: - _cleanup(beam) - return 0 - while not beam.is_done: - self._advance_beam(beam, None, False) - state = beam.at(0) - self.moves.finalize_state(state.c) - for i in range(length): - tokens[i] = state.c._sent[i] - _cleanup(beam) - - def update(self, Doc tokens, GoldParse gold_parse, itn=0): - self.moves.preprocess_gold(gold_parse) - cdef Beam pred = Beam(self.moves.n_moves, self.beam_width) - pred.initialize(self.moves.init_beam_state, tokens.length, tokens.c) - pred.check_done(_check_final_state, NULL) - # Hack for NER - for i in range(pred.size): - stcls = pred.at(i) - self.moves.initialize_state(stcls.c) - - cdef Beam gold = Beam(self.moves.n_moves, self.beam_width, min_density=0.0) - gold.initialize(self.moves.init_beam_state, tokens.length, tokens.c) - gold.check_done(_check_final_state, NULL) - violn = MaxViolation() - while not pred.is_done and not gold.is_done: - # We search separately here, to allow for ambiguity in the gold parse. - self._advance_beam(pred, gold_parse, False) - self._advance_beam(gold, gold_parse, True) - violn.check_crf(pred, gold) - if pred.loss > 0 and pred.min_score > (gold.score + self.model.time): - break - else: - # The non-monotonic oracle makes it difficult to ensure final costs are - # correct. Therefore do final correction - for i in range(pred.size): - if self.moves.is_gold_parse(pred.at(i), gold_parse): - pred._states[i].loss = 0.0 - elif pred._states[i].loss == 0.0: - pred._states[i].loss = 1.0 - violn.check_crf(pred, gold) - if pred.size < 1: - raise Exception("No candidates", tokens.length) - if gold.size < 1: - raise Exception("No gold", tokens.length) - if pred.loss == 0: - self.model.update_from_histories(self.moves, tokens, [(0.0, [])]) - elif True: - #_check_train_integrity(pred, gold, gold_parse, self.moves) - histories = list(zip(violn.p_probs, violn.p_hist)) + \ - list(zip(violn.g_probs, violn.g_hist)) - self.model.update_from_histories(self.moves, tokens, histories, min_grad=0.001**(itn+1)) - else: - self.model.update_from_histories(self.moves, tokens, - [(1.0, violn.p_hist[0]), (-1.0, violn.g_hist[0])]) - _cleanup(pred) - _cleanup(gold) - return pred.loss - - def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold): - cdef atom_t[CONTEXT_SIZE] context - cdef Pool mem = Pool() - features = mem.alloc(self.model.nr_feat, sizeof(FeatureC)) - if False: - mb = Minibatch(self.model.widths, beam.size) - for i in range(beam.size): - stcls = beam.at(i) - if stcls.c.is_final(): - nr_feat = 0 - else: - nr_feat = self.model.set_featuresC(context, features, stcls.c) - self.moves.set_valid(beam.is_valid[i], stcls.c) - mb.c.push_back(features, nr_feat, beam.costs[i], beam.is_valid[i], 0) - self.model(mb) - for i in range(beam.size): - memcpy(beam.scores[i], mb.c.scores(i), mb.c.nr_out() * sizeof(beam.scores[i][0])) - else: - for i in range(beam.size): - stcls = beam.at(i) - if not stcls.is_final(): - nr_feat = self.model.set_featuresC(context, features, stcls.c) - self.moves.set_valid(beam.is_valid[i], stcls.c) - self.model.set_scoresC(beam.scores[i], features, nr_feat) - if gold is not None: - n_gold = 0 - lines = [] - for i in range(beam.size): - stcls = beam.at(i) - if not stcls.c.is_final(): - self.moves.set_costs(beam.is_valid[i], beam.costs[i], stcls, gold) - if follow_gold: - for j in range(self.moves.n_moves): - if beam.costs[i][j] >= 1: - beam.is_valid[i][j] = 0 - lines.append((stcls.B(0), 
stcls.B(1), - stcls.B_(0).ent_iob, stcls.B_(1).ent_iob, - stcls.B_(1).sent_start, - j, - beam.is_valid[i][j], 'set invalid', - beam.costs[i][j], self.moves.c[j].move, self.moves.c[j].label)) - n_gold += 1 if beam.is_valid[i][j] else 0 - if follow_gold and n_gold == 0: - raise Exception("No gold") - if follow_gold: - beam.advance(_transition_state, NULL, self.moves.c) - else: - beam.advance(_transition_state, _hash_state, self.moves.c) - beam.check_done(_check_final_state, NULL) - - -# These are passed as callbacks to thinc.search.Beam -cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: - dest = _dest - src = _src - moves = _moves - dest.clone(src) - moves[clas].do(dest.c, moves[clas].label) - - -cdef int _check_final_state(void* _state, void* extra_args) except -1: - return (_state).is_final() - - -def _cleanup(Beam beam): - for i in range(beam.width): - Py_XDECREF(beam._states[i].content) - Py_XDECREF(beam._parents[i].content) - - -cdef hash_t _hash_state(void* _state, void* _) except 0: - state = _state - if state.c.is_final(): - return 1 - else: - return state.c.hash() - - -def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, TransitionSystem moves): - for i in range(pred.size): - if not pred._states[i].is_done or pred._states[i].loss == 0: - continue - state = pred.at(i) - if moves.is_gold_parse(state, gold_parse) == True: - for dep in gold_parse.orig_annot: - print(dep[1], dep[3], dep[4]) - print("Cost", pred._states[i].loss) - for j in range(gold_parse.length): - print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep]) - acts = [moves.c[clas].move for clas in pred.histories[i]] - labels = [moves.c[clas].label for clas in pred.histories[i]] - print([moves.move_name(move, label) for move, label in zip(acts, labels)]) - raise Exception("Predicted state is gold-standard") - for i in range(gold.size): - if not gold._states[i].is_done: - continue - state = gold.at(i) - if moves.is_gold(state, gold_parse) == False: - print("Truth") - for dep in gold_parse.orig_annot: - print(dep[1], dep[3], dep[4]) - print("Predicted good") - for j in range(gold_parse.length): - print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep]) - raise Exception("Gold parse is not gold-standard") - - diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd deleted file mode 100644 index 95b6c3d3f..000000000 --- a/spacy/syntax/parser.pxd +++ /dev/null @@ -1,24 +0,0 @@ -from thinc.linear.avgtron cimport AveragedPerceptron -from thinc.typedefs cimport atom_t -from thinc.structs cimport FeatureC - -from .stateclass cimport StateClass -from .arc_eager cimport TransitionSystem -from ..vocab cimport Vocab -from ..tokens.doc cimport Doc -from ..structs cimport TokenC -from ._state cimport StateC - - -cdef class ParserModel(AveragedPerceptron): - cdef int set_featuresC(self, atom_t* context, FeatureC* features, - const StateC* state) nogil - - -cdef class Parser: - cdef readonly Vocab vocab - cdef readonly ParserModel model - cdef readonly TransitionSystem moves - cdef readonly object cfg - - cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx deleted file mode 100644 index 78698db12..000000000 --- a/spacy/syntax/parser.pyx +++ /dev/null @@ -1,526 +0,0 @@ -""" -MALT-style dependency parser -""" -# coding: utf-8 -# cython: infer_types=True -from __future__ import unicode_literals - -from collections import Counter -import ujson - 
-cimport cython -cimport cython.parallel - -import numpy.random - -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF -from cpython.exc cimport PyErr_CheckSignals -from libc.stdint cimport uint32_t, uint64_t -from libc.string cimport memset, memcpy -from libc.stdlib cimport malloc, calloc, free -from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t -from thinc.linear.avgtron cimport AveragedPerceptron -from thinc.linalg cimport VecVec -from thinc.structs cimport SparseArrayC, FeatureC, ExampleC -from thinc.extra.eg cimport Example -from cymem.cymem cimport Pool, Address -from murmurhash.mrmr cimport hash64 -from preshed.maps cimport MapStruct -from preshed.maps cimport map_get - -from . import _parse_features -from ._parse_features cimport CONTEXT_SIZE -from ._parse_features cimport fill_context -from .stateclass cimport StateClass -from ._state cimport StateC -from .transition_system import OracleError -from .transition_system cimport TransitionSystem, Transition -from ..structs cimport TokenC -from ..tokens.doc cimport Doc -from ..strings cimport StringStore -from ..gold cimport GoldParse - - -USE_FTRL = True -DEBUG = False -def set_debug(val): - global DEBUG - DEBUG = val - - -def get_templates(name): - pf = _parse_features - if name == 'ner': - return pf.ner - elif name == 'debug': - return pf.unigrams - elif name.startswith('embed'): - return (pf.words, pf.tags, pf.labels) - else: - return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ - pf.tree_shape + pf.trigrams) - - -cdef class ParserModel(AveragedPerceptron): - cdef int set_featuresC(self, atom_t* context, FeatureC* features, - const StateC* state) nogil: - fill_context(context, state) - nr_feat = self.extracter.set_features(features, context) - return nr_feat - - def update(self, Example eg, itn=0): - """ - Does regression on negative cost. Sort of cute? 
- """ - self.time += 1 - cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class) - cdef int guess = eg.guess - if guess == best or best == -1: - return 0.0 - cdef FeatureC feat - cdef int clas - cdef weight_t gradient - if USE_FTRL: - for feat in eg.c.features[:eg.c.nr_feat]: - for clas in range(eg.c.nr_class): - if eg.c.is_valid[clas] and eg.c.scores[clas] >= eg.c.scores[best]: - gradient = eg.c.scores[clas] + eg.c.costs[clas] - self.update_weight_ftrl(feat.key, clas, feat.value * gradient) - else: - for feat in eg.c.features[:eg.c.nr_feat]: - self.update_weight(feat.key, guess, feat.value * eg.c.costs[guess]) - self.update_weight(feat.key, best, -feat.value * eg.c.costs[guess]) - return eg.c.costs[guess] - - def update_from_histories(self, TransitionSystem moves, Doc doc, histories, weight_t min_grad=0.0): - cdef Pool mem = Pool() - features = mem.alloc(self.nr_feat, sizeof(FeatureC)) - - cdef StateClass stcls - - cdef class_t clas - self.time += 1 - cdef atom_t[CONTEXT_SIZE] atoms - histories = [(grad, hist) for grad, hist in histories if abs(grad) >= min_grad and hist] - if not histories: - return None - gradient = [Counter() for _ in range(max([max(h)+1 for _, h in histories]))] - for d_loss, history in histories: - stcls = StateClass.init(doc.c, doc.length) - moves.initialize_state(stcls.c) - for clas in history: - nr_feat = self.set_featuresC(atoms, features, stcls.c) - clas_grad = gradient[clas] - for feat in features[:nr_feat]: - clas_grad[feat.key] += d_loss * feat.value - moves.c[clas].do(stcls.c, moves.c[clas].label) - cdef feat_t key - cdef weight_t d_feat - for clas, clas_grad in enumerate(gradient): - for key, d_feat in clas_grad.items(): - if d_feat != 0: - self.update_weight_ftrl(key, clas, d_feat) - - -cdef class Parser: - """ - Base class of the DependencyParser and EntityRecognizer. - """ - @classmethod - def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg): - """ - Load the statistical model from the supplied path. - - Arguments: - path (Path): - The path to load from. - vocab (Vocab): - The vocabulary. Must be shared by the documents to be processed. - require (bool): - Whether to raise an error if the files are not found. - Returns (Parser): - The newly constructed object. - """ - with (path / 'config.json').open() as file_: - cfg = ujson.load(file_) - # TODO: remove this shim when we don't have to support older data - if 'labels' in cfg and 'actions' not in cfg: - cfg['actions'] = cfg.pop('labels') - # TODO: remove this shim when we don't have to support older data - for action_name, labels in dict(cfg.get('actions', {})).items(): - # We need this to be sorted - if isinstance(labels, dict): - labels = list(sorted(labels.keys())) - cfg['actions'][action_name] = labels - self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg) - if (path / 'model').exists(): - self.model.load(str(path / 'model')) - elif require: - raise IOError( - "Required file %s/model not found when loading" % str(path)) - return self - - def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg): - """ - Create a Parser. - - Arguments: - vocab (Vocab): - The vocabulary object. Must be shared with documents to be processed. - model (thinc.linear.AveragedPerceptron): - The statistical model. - Returns (Parser): - The newly constructed object. 
- """ - if TransitionSystem is None: - TransitionSystem = self.TransitionSystem - self.vocab = vocab - cfg['actions'] = TransitionSystem.get_actions(**cfg) - self.moves = TransitionSystem(vocab.strings, cfg['actions']) - # TODO: Remove this when we no longer need to support old-style models - if isinstance(cfg.get('features'), basestring): - cfg['features'] = get_templates(cfg['features']) - elif 'features' not in cfg: - cfg['features'] = self.feature_templates - - self.model = ParserModel(cfg['features']) - self.model.l1_penalty = cfg.get('L1', 0.0) - self.model.learn_rate = cfg.get('learn_rate', 0.001) - - self.cfg = cfg - # TODO: This is a pretty hacky fix to the problem of adding more - # labels. The issue is they come in out of order, if labels are - # added during training - for label in cfg.get('extra_labels', []): - self.add_label(label) - - def __reduce__(self): - return (Parser, (self.vocab, self.moves, self.model), None, None) - - def __call__(self, Doc tokens): - """ - Apply the entity recognizer, setting the annotations onto the Doc object. - - Arguments: - doc (Doc): The document to be processed. - Returns: - None - """ - cdef int nr_feat = self.model.nr_feat - with nogil: - status = self.parseC(tokens.c, tokens.length, nr_feat) - # Check for KeyboardInterrupt etc. Untested - PyErr_CheckSignals() - if status != 0: - raise ParserStateError(tokens) - self.moves.finalize_doc(tokens) - - def pipe(self, stream, int batch_size=1000, int n_threads=2): - """ - Process a stream of documents. - - Arguments: - stream: The sequence of documents to process. - batch_size (int): - The number of documents to accumulate into a working set. - n_threads (int): - The number of threads with which to work on the buffer in parallel. - Yields (Doc): Documents, in order. - """ - cdef Pool mem = Pool() - cdef TokenC** doc_ptr = mem.alloc(batch_size, sizeof(TokenC*)) - cdef int* lengths = mem.alloc(batch_size, sizeof(int)) - cdef Doc doc - cdef int i - cdef int nr_feat = self.model.nr_feat - cdef int status - queue = [] - for doc in stream: - doc_ptr[len(queue)] = doc.c - lengths[len(queue)] = doc.length - queue.append(doc) - if len(queue) == batch_size: - with nogil: - for i in cython.parallel.prange(batch_size, num_threads=n_threads): - status = self.parseC(doc_ptr[i], lengths[i], nr_feat) - if status != 0: - with gil: - raise ParserStateError(queue[i]) - PyErr_CheckSignals() - for doc in queue: - self.moves.finalize_doc(doc) - yield doc - queue = [] - batch_size = len(queue) - with nogil: - for i in cython.parallel.prange(batch_size, num_threads=n_threads): - status = self.parseC(doc_ptr[i], lengths[i], nr_feat) - if status != 0: - with gil: - raise ParserStateError(queue[i]) - PyErr_CheckSignals() - for doc in queue: - self.moves.finalize_doc(doc) - yield doc - - cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil: - state = new StateC(tokens, length) - # NB: This can change self.moves.n_moves! 
- # I think this causes memory errors if called by .pipe() - self.moves.initialize_state(state) - nr_class = self.moves.n_moves - - cdef ExampleC eg - eg.nr_feat = nr_feat - eg.nr_atom = CONTEXT_SIZE - eg.nr_class = nr_class - eg.features = calloc(sizeof(FeatureC), nr_feat) - eg.atoms = calloc(sizeof(atom_t), CONTEXT_SIZE) - eg.scores = calloc(sizeof(weight_t), nr_class) - eg.is_valid = calloc(sizeof(int), nr_class) - cdef int i - while not state.is_final(): - eg.nr_feat = self.model.set_featuresC(eg.atoms, eg.features, state) - self.moves.set_valid(eg.is_valid, state) - self.model.set_scoresC(eg.scores, eg.features, eg.nr_feat) - - guess = VecVec.arg_max_if_true(eg.scores, eg.is_valid, eg.nr_class) - if guess < 0: - return 1 - - action = self.moves.c[guess] - - action.do(state, action.label) - memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class) - for i in range(eg.nr_class): - eg.is_valid[i] = 1 - self.moves.finalize_state(state) - for i in range(length): - tokens[i] = state._sent[i] - del state - free(eg.features) - free(eg.atoms) - free(eg.scores) - free(eg.is_valid) - return 0 - - def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0): - """ - Update the statistical model. - - Arguments: - doc (Doc): - The example document for the update. - gold (GoldParse): - The gold-standard annotations, to calculate the loss. - Returns (float): - The loss on this example. - """ - self.moves.preprocess_gold(gold) - cdef StateClass stcls = StateClass.init(tokens.c, tokens.length) - self.moves.initialize_state(stcls.c) - cdef Pool mem = Pool() - cdef Example eg = Example( - nr_class=self.moves.n_moves, - nr_atom=CONTEXT_SIZE, - nr_feat=self.model.nr_feat) - cdef weight_t loss = 0 - cdef Transition action - cdef double dropout_rate = self.cfg.get('dropout', drop) - while not stcls.is_final(): - eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features, - stcls.c) - dropout(eg.c.features, eg.c.nr_feat, dropout_rate) - self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) - self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) - guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) - self.model.update(eg) - - action = self.moves.c[guess] - action.do(stcls.c, action.label) - loss += eg.costs[guess] - eg.fill_scores(0, eg.c.nr_class) - eg.fill_costs(0, eg.c.nr_class) - eg.fill_is_valid(1, eg.c.nr_class) - - self.moves.finalize_state(stcls.c) - return loss - - def step_through(self, Doc doc, GoldParse gold=None): - """ - Set up a stepwise state, to introspect and control the transition sequence. - - Arguments: - doc (Doc): The document to step through. - gold (GoldParse): Optional gold parse - Returns (StepwiseState): - A state object, to step through the annotation process. - """ - return StepwiseState(self, doc, gold=gold) - - def from_transition_sequence(self, Doc doc, sequence): - """Control the annotations on a document by specifying a transition sequence - to follow. - - Arguments: - doc (Doc): The document to annotate. - sequence: A sequence of action names, as unicode strings. - Returns: None - """ - with self.step_through(doc) as stepwise: - for transition in sequence: - stepwise.transition(transition) - - def add_label(self, label): - # Doesn't set label into serializer -- subclasses override it to do that. - for action in self.moves.action_types: - added = self.moves.add_action(action, label) - if added: - # Important that the labels be stored as a list! 
We need the - # order, or the model goes out of synch - self.cfg.setdefault('extra_labels', []).append(label) - - -cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1: - if prob <= 0 or prob >= 1.: - return 0 - cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat) - cdef double* probs = &py_probs[0] - for i in range(nr_feat): - if probs[i] >= prob: - feats[i].value /= prob - else: - feats[i].value = 0. - - -cdef class StepwiseState: - cdef readonly StateClass stcls - cdef readonly Example eg - cdef readonly Doc doc - cdef readonly GoldParse gold - cdef readonly Parser parser - - def __init__(self, Parser parser, Doc doc, GoldParse gold=None): - self.parser = parser - self.doc = doc - if gold is not None: - self.gold = gold - self.parser.moves.preprocess_gold(self.gold) - else: - self.gold = GoldParse(doc) - self.stcls = StateClass.init(doc.c, doc.length) - self.parser.moves.initialize_state(self.stcls.c) - self.eg = Example( - nr_class=self.parser.moves.n_moves, - nr_atom=CONTEXT_SIZE, - nr_feat=self.parser.model.nr_feat) - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.finish() - - @property - def is_final(self): - return self.stcls.is_final() - - @property - def stack(self): - return self.stcls.stack - - @property - def queue(self): - return self.stcls.queue - - @property - def heads(self): - return [self.stcls.H(i) for i in range(self.stcls.c.length)] - - @property - def deps(self): - return [self.doc.vocab.strings[self.stcls.c._sent[i].dep] - for i in range(self.stcls.c.length)] - - @property - def costs(self): - """ - Find the action-costs for the current state. - """ - if not self.gold: - raise ValueError("Can't set costs: No GoldParse provided") - self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs, - self.stcls, self.gold) - costs = {} - for i in range(self.parser.moves.n_moves): - if not self.eg.c.is_valid[i]: - continue - transition = self.parser.moves.c[i] - name = self.parser.moves.move_name(transition.move, transition.label) - costs[name] = self.eg.c.costs[i] - return costs - - def predict(self): - self.eg.reset() - self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features, - self.stcls.c) - self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls.c) - self.parser.model.set_scoresC(self.eg.c.scores, - self.eg.c.features, self.eg.c.nr_feat) - - cdef Transition action = self.parser.moves.c[self.eg.guess] - return self.parser.moves.move_name(action.move, action.label) - - def transition(self, action_name=None): - if action_name is None: - action_name = self.predict() - moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3} - if action_name == '_': - action_name = self.predict() - action = self.parser.moves.lookup_transition(action_name) - elif action_name == 'L' or action_name == 'R': - self.predict() - move = moves[action_name] - clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c, - self.eg.c.nr_class) - action = self.parser.moves.c[clas] - else: - action = self.parser.moves.lookup_transition(action_name) - action.do(self.stcls.c, action.label) - - def finish(self): - if self.stcls.is_final(): - self.parser.moves.finalize_state(self.stcls.c) - self.doc.set_parse(self.stcls.c._sent) - self.parser.moves.finalize_doc(self.doc) - - -class ParserStateError(ValueError): - def __init__(self, doc): - ValueError.__init__(self, - "Error analysing doc -- no valid actions available. This should " - "never happen, so please report the error on the issue tracker. 
" - "Here's the thread to do so --- reopen it if it's closed:\n" - "https://github.com/spacy-io/spaCy/issues/429\n" - "Please include the text that the parser failed on, which is:\n" - "%s" % repr(doc.text)) - -cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil: - cdef int best = -1 - for i in range(n): - if costs[i] <= 0: - if best == -1 or scores[i] > scores[best]: - best = i - return best - - -cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, - int nr_class) except -1: - cdef weight_t score = 0 - cdef int mode = -1 - cdef int i - for i in range(nr_class): - if actions[i].move == move and (mode == -1 or scores[i] >= score): - mode = i - score = scores[i] - return mode From 96b4214303a957e4b81c77f8bdc3d14c6f778318 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 12:57:32 +0200 Subject: [PATCH 44/99] Add notes on pipe template inheritance in docs --- website/api/dependencyparser.jade | 1 + website/api/entityrecognizer.jade | 1 + website/api/tagger.jade | 1 + website/api/tensorizer.jade | 1 + website/api/textcategorizer.jade | 1 + 5 files changed, 5 insertions(+) diff --git a/website/api/dependencyparser.jade b/website/api/dependencyparser.jade index ca56d6816..e557ef9da 100644 --- a/website/api/dependencyparser.jade +++ b/website/api/dependencyparser.jade @@ -2,4 +2,5 @@ include ../_includes/_mixins +//- This class inherits from Pipe, so this page uses the template in pipe.jade. !=partial("pipe", { subclass: "DependencyParser", short: "parser", pipeline_id: "parser" }) diff --git a/website/api/entityrecognizer.jade b/website/api/entityrecognizer.jade index aff33bde7..a8b68e453 100644 --- a/website/api/entityrecognizer.jade +++ b/website/api/entityrecognizer.jade @@ -2,4 +2,5 @@ include ../_includes/_mixins +//- This class inherits from Pipe, so this page uses the template in pipe.jade. !=partial("pipe", { subclass: "EntityRecognizer", short: "ner", pipeline_id: "ner" }) diff --git a/website/api/tagger.jade b/website/api/tagger.jade index 4c8ce916f..7a7e9214f 100644 --- a/website/api/tagger.jade +++ b/website/api/tagger.jade @@ -2,4 +2,5 @@ include ../_includes/_mixins +//- This class inherits from Pipe, so this page uses the template in pipe.jade. !=partial("pipe", { subclass: "Tagger", pipeline_id: "tagger" }) diff --git a/website/api/tensorizer.jade b/website/api/tensorizer.jade index b54e20514..cc79f36e3 100644 --- a/website/api/tensorizer.jade +++ b/website/api/tensorizer.jade @@ -2,4 +2,5 @@ include ../_includes/_mixins +//- This class inherits from Pipe, so this page uses the template in pipe.jade. !=partial("pipe", { subclass: "Tensorizer", pipeline_id: "tensorizer" }) diff --git a/website/api/textcategorizer.jade b/website/api/textcategorizer.jade index 2d550f699..a9684b15d 100644 --- a/website/api/textcategorizer.jade +++ b/website/api/textcategorizer.jade @@ -16,4 +16,5 @@ p | before a logistic activation is applied elementwise. The value of each | output neuron is the probability that some class is present. +//- This class inherits from Pipe, so this page uses the template in pipe.jade. 
!=partial("pipe", { subclass: "TextCategorizer", short: "textcat", pipeline_id: "textcat" }) From 9bf78d5fb3a638f4463a21fc7439e9edf1dba04b Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 13:04:25 +0200 Subject: [PATCH 45/99] Update spacy.explain docs --- spacy/glossary.py | 10 ++++++++++ website/api/_top-level/_spacy.jade | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/spacy/glossary.py b/spacy/glossary.py index ed1c22c21..55f337b1a 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -3,6 +3,16 @@ from __future__ import unicode_literals def explain(term): + """Get a description for a given POS tag, dependency label or entity type. + + term (unicode): The term to explain. + RETURNS (unicode): The explanation, or `None` if not found in the glossary. + + EXAMPLE: + >>> spacy.explain(u'NORP') + >>> doc = nlp(u'Hello world') + >>> print([w.text, w.tag_, spacy.explain(w.tag_) for w in doc]) + """ if term in GLOSSARY: return GLOSSARY[term] diff --git a/website/api/_top-level/_spacy.jade b/website/api/_top-level/_spacy.jade index 81ec744ad..81612c5e6 100644 --- a/website/api/_top-level/_spacy.jade +++ b/website/api/_top-level/_spacy.jade @@ -136,7 +136,7 @@ p | #[+src(gh("spacy", "spacy/glossary.py")) #[code glossary.py]]. +aside-code("Example"). - spacy.explain('NORP') + spacy.explain(u'NORP') # Nationalities or religious or political groups doc = nlp(u'Hello world') From 6f78e29bed2d226ebaf316f16fc329c0c07371c3 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 13:04:35 +0200 Subject: [PATCH 46/99] Add LAW entity label to glossary --- spacy/glossary.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/glossary.py b/spacy/glossary.py index 55f337b1a..fd74d85e7 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -293,6 +293,7 @@ GLOSSARY = { 'PRODUCT': 'Objects, vehicles, foods, etc. 
(not services)', 'EVENT': 'Named hurricanes, battles, wars, sports events, etc.', 'WORK_OF_ART': 'Titles of books, songs, etc.', + 'LAW': 'Named documents made into laws.', 'LANGUAGE': 'Any named language', 'DATE': 'Absolute or relative dates or periods', 'TIME': 'Times smaller than a day', From 90d1d9b230522124eaefba5172ac28b5b708a215 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 26 Oct 2017 13:22:45 +0200 Subject: [PATCH 47/99] Remove obsolete parser code --- setup.py | 5 ----- spacy/language.py | 1 - spacy/syntax/nn_parser.pyx | 3 --- 3 files changed, 9 deletions(-) diff --git a/setup.py b/setup.py index 2e2b816b7..f7525a3ff 100755 --- a/setup.py +++ b/setup.py @@ -30,19 +30,14 @@ MOD_NAMES = [ 'spacy.syntax._state', 'spacy.syntax._beam_utils', 'spacy.tokenizer', - 'spacy._cfile', - 'spacy.syntax.parser', 'spacy.syntax.nn_parser', - 'spacy.syntax.beam_parser', 'spacy.syntax.nonproj', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', - 'spacy.syntax._parse_features', 'spacy.gold', 'spacy.tokens.doc', 'spacy.tokens.span', 'spacy.tokens.token', - 'spacy.cfile', 'spacy.matcher', 'spacy.syntax.ner', 'spacy.symbols', diff --git a/spacy/language.py b/spacy/language.py index c4777898e..34bc49263 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -16,7 +16,6 @@ from .tokenizer import Tokenizer from .vocab import Vocab from .tagger import Tagger from .lemmatizer import Lemmatizer -from .syntax.parser import get_templates from .pipeline import DependencyParser, Tensorizer, Tagger from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 913d2365f..c592cdc22 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -54,9 +54,6 @@ from .._ml import link_vectors_to_models from .._ml import HistoryFeatures from ..compat import json_dumps, copy_array -from . import _parse_features -from ._parse_features cimport CONTEXT_SIZE -from ._parse_features cimport fill_context from .stateclass cimport StateClass from ._state cimport StateC from . 
import nonproj From ea03f1ef6431791700aa8458d720de94a31cb68b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 26 Oct 2017 13:23:36 +0200 Subject: [PATCH 48/99] Remove obsolete cfile code --- spacy/_cfile.pxd | 26 ------------ spacy/_cfile.pyx | 88 ---------------------------------------- spacy/cfile.pxd | 33 --------------- spacy/cfile.pyx | 103 ----------------------------------------------- 4 files changed, 250 deletions(-) delete mode 100644 spacy/_cfile.pxd delete mode 100644 spacy/_cfile.pyx delete mode 100644 spacy/cfile.pxd delete mode 100644 spacy/cfile.pyx diff --git a/spacy/_cfile.pxd b/spacy/_cfile.pxd deleted file mode 100644 index cb0077587..000000000 --- a/spacy/_cfile.pxd +++ /dev/null @@ -1,26 +0,0 @@ -from libc.stdio cimport fopen, fclose, fread, fwrite, FILE -from cymem.cymem cimport Pool - -cdef class CFile: - cdef FILE* fp - cdef bint is_open - cdef Pool mem - cdef int size # For compatibility with subclass - cdef int _capacity # For compatibility with subclass - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * - - - -cdef class StringCFile(CFile): - cdef unsigned char* data - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * diff --git a/spacy/_cfile.pyx b/spacy/_cfile.pyx deleted file mode 100644 index ceebe2e59..000000000 --- a/spacy/_cfile.pyx +++ /dev/null @@ -1,88 +0,0 @@ -from libc.stdio cimport fopen, fclose, fread, fwrite, FILE -from libc.string cimport memcpy - - -cdef class CFile: - def __init__(self, loc, mode, on_open_error=None): - if isinstance(mode, unicode): - mode_str = mode.encode('ascii') - else: - mode_str = mode - if hasattr(loc, 'as_posix'): - loc = loc.as_posix() - self.mem = Pool() - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc - self.fp = fopen(bytes_loc, mode_str) - if self.fp == NULL: - if on_open_error is not None: - on_open_error() - else: - raise IOError("Could not open binary file %s" % bytes_loc) - self.is_open = True - - def __dealloc__(self): - if self.is_open: - fclose(self.fp) - - def close(self): - fclose(self.fp) - self.is_open = False - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - st = fread(dest, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: - st = fwrite(src, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) - - -cdef class StringCFile: - def __init__(self, mode, bytes data=b'', on_open_error=None): - self.mem = Pool() - self.is_open = 'w' in mode - self._capacity = max(len(data), 8) - self.size = len(data) - self.data = self.mem.alloc(1, self._capacity) - for i in range(len(data)): - self.data[i] = data[i] - - def close(self): - self.is_open = False - - def string_data(self): - return 
(self.data-self.size)[:self.size] - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - memcpy(dest, self.data, elem_size * number) - self.data += elem_size * number - - cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1: - write_size = number * elem_size - if (self.size + write_size) >= self._capacity: - self._capacity = (self.size + write_size) * 2 - self.data = self.mem.realloc(self.data, self._capacity) - memcpy(&self.data[self.size], src, elem_size * number) - self.size += write_size - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) diff --git a/spacy/cfile.pxd b/spacy/cfile.pxd deleted file mode 100644 index b95fbb2be..000000000 --- a/spacy/cfile.pxd +++ /dev/null @@ -1,33 +0,0 @@ -from libc.stdio cimport fopen, fclose, fread, fwrite, FILE -from cymem.cymem cimport Pool - -cdef class CFile: - cdef FILE* fp - cdef unsigned char* data - cdef int is_open - cdef Pool mem - cdef int size # For compatibility with subclass - cdef int i # For compatibility with subclass - cdef int _capacity # For compatibility with subclass - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * - - - -cdef class StringCFile: - cdef unsigned char* data - cdef int is_open - cdef Pool mem - cdef int size # For compatibility with subclass - cdef int i # For compatibility with subclass - cdef int _capacity # For compatibility with subclass - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx deleted file mode 100644 index 006ff78ac..000000000 --- a/spacy/cfile.pyx +++ /dev/null @@ -1,103 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from libc.stdio cimport fopen, fclose, fread, fwrite -from libc.string cimport memcpy - - -cdef class CFile: - def __init__(self, loc, mode, on_open_error=None): - if isinstance(mode, unicode): - mode_str = mode.encode('ascii') - else: - mode_str = mode - if hasattr(loc, 'as_posix'): - loc = loc.as_posix() - self.mem = Pool() - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc - self.fp = fopen(bytes_loc, mode_str) - if self.fp == NULL: - if on_open_error is not None: - on_open_error() - else: - raise IOError("Could not open binary file %s" % bytes_loc) - self.is_open = True - - def __dealloc__(self): - if self.is_open: - fclose(self.fp) - - def close(self): - fclose(self.fp) - self.is_open = False - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - st = fread(dest, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: - st = fwrite(src, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = 
mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) - - -cdef class StringCFile: - def __init__(self, bytes data, mode, on_open_error=None): - self.mem = Pool() - self.is_open = 1 if 'w' in mode else 0 - self._capacity = max(len(data), 8) - self.size = len(data) - self.i = 0 - self.data = self.mem.alloc(1, self._capacity) - for i in range(len(data)): - self.data[i] = data[i] - - def __dealloc__(self): - # Important to override this -- or - # we try to close a non-existant file pointer! - pass - - def close(self): - self.is_open = False - - def string_data(self): - cdef bytes byte_string = b'\0' * (self.size) - bytes_ptr = byte_string - for i in range(self.size): - bytes_ptr[i] = self.data[i] - print(byte_string) - return byte_string - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - if self.i+(number * elem_size) < self.size: - memcpy(dest, &self.data[self.i], elem_size * number) - self.i += elem_size * number - - cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1: - write_size = number * elem_size - if (self.size + write_size) >= self._capacity: - self._capacity = (self.size + write_size) * 2 - self.data = self.mem.realloc(self.data, self._capacity) - memcpy(&self.data[self.size], src, write_size) - self.size += write_size - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) From c52671420c7b2554274009faa976a2788dc16d13 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 26 Oct 2017 13:28:19 +0200 Subject: [PATCH 49/99] Remove old cfile import --- spacy/vocab.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index bcd1f3c10..193509771 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -16,7 +16,6 @@ from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme from .strings cimport hash_string from .typedefs cimport attr_t -from .cfile cimport CFile from .tokens.token cimport Token from .attrs cimport PROB, LANG from .structs cimport SerializedLexemeC From c30258c3a2635e21f6e6f3c8ed7cb314a431794e Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 14:23:52 +0200 Subject: [PATCH 50/99] Remove old example --- examples/training/train_ner_standalone.py | 206 ---------------------- 1 file changed, 206 deletions(-) delete mode 100644 examples/training/train_ner_standalone.py diff --git a/examples/training/train_ner_standalone.py b/examples/training/train_ner_standalone.py deleted file mode 100644 index 0c5094bb7..000000000 --- a/examples/training/train_ner_standalone.py +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/env python -'''Example of training a named entity recognition system from scratch using spaCy - -This example is written to be self-contained and reasonably transparent. -To achieve that, it duplicates some of spaCy's internal functionality. - -Specifically, in this example, we don't use spaCy's built-in Language class to -wire together the Vocab, Tokenizer and EntityRecognizer. 
Instead, we write -our own simple Pipeline class, so that it's easier to see how the pieces -interact. - -Input data: -https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip - -Developed for: spaCy 1.7.1 -Last tested for: spaCy 2.0.0a13 -''' -from __future__ import unicode_literals, print_function -import plac -from pathlib import Path -import random -import json -import tqdm - -from thinc.neural.optimizers import Adam -from thinc.neural.ops import NumpyOps - -from spacy.vocab import Vocab -from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer -from spacy.tokenizer import Tokenizer -from spacy.tokens import Doc -from spacy.attrs import * -from spacy.gold import GoldParse -from spacy.gold import iob_to_biluo -from spacy.gold import minibatch -from spacy.scorer import Scorer -import spacy.util - - -try: - unicode -except NameError: - unicode = str - - -spacy.util.set_env_log(True) - - -def init_vocab(): - return Vocab( - lex_attr_getters={ - LOWER: lambda string: string.lower(), - NORM: lambda string: string.lower(), - PREFIX: lambda string: string[0], - SUFFIX: lambda string: string[-3:], - }) - - -class Pipeline(object): - def __init__(self, vocab=None, tokenizer=None, entity=None): - if vocab is None: - vocab = init_vocab() - if tokenizer is None: - tokenizer = Tokenizer(vocab, {}, None, None, None) - if entity is None: - entity = NeuralEntityRecognizer(vocab) - self.vocab = vocab - self.tokenizer = tokenizer - self.entity = entity - self.pipeline = [self.entity] - - def begin_training(self): - for model in self.pipeline: - model.begin_training([]) - optimizer = Adam(NumpyOps(), 0.001) - return optimizer - - def __call__(self, input_): - doc = self.make_doc(input_) - for process in self.pipeline: - process(doc) - return doc - - def make_doc(self, input_): - if isinstance(input_, bytes): - input_ = input_.decode('utf8') - if isinstance(input_, unicode): - return self.tokenizer(input_) - else: - return Doc(self.vocab, words=input_) - - def make_gold(self, input_, annotations): - doc = self.make_doc(input_) - gold = GoldParse(doc, entities=annotations) - return gold - - def update(self, inputs, annots, sgd, losses=None, drop=0.): - if losses is None: - losses = {} - docs = [self.make_doc(input_) for input_ in inputs] - golds = [self.make_gold(input_, annot) for input_, annot in - zip(inputs, annots)] - - self.entity.update(docs, golds, drop=drop, - sgd=sgd, losses=losses) - return losses - - def evaluate(self, examples): - scorer = Scorer() - for input_, annot in examples: - gold = self.make_gold(input_, annot) - doc = self(input_) - scorer.score(doc, gold) - return scorer.scores - - def to_disk(self, path): - path = Path(path) - if not path.exists(): - path.mkdir() - elif not path.is_dir(): - raise IOError("Can't save pipeline to %s\nNot a directory" % path) - self.vocab.to_disk(path / 'vocab') - self.entity.to_disk(path / 'ner') - - def from_disk(self, path): - path = Path(path) - if not path.exists(): - raise IOError("Cannot load pipeline from %s\nDoes not exist" % path) - if not path.is_dir(): - raise IOError("Cannot load pipeline from %s\nNot a directory" % path) - self.vocab = self.vocab.from_disk(path / 'vocab') - self.entity = self.entity.from_disk(path / 'ner') - - -def train(nlp, train_examples, dev_examples, nr_epoch=5): - sgd = nlp.begin_training() - print("Iter", "Loss", "P", "R", "F") - for i in range(nr_epoch): - random.shuffle(train_examples) - losses = {} - for batch in 
minibatch(tqdm.tqdm(train_examples, leave=False), size=8): - inputs, annots = zip(*batch) - nlp.update(list(inputs), list(annots), sgd, losses=losses) - scores = nlp.evaluate(dev_examples) - report_scores(i+1, losses['ner'], scores) - - -def report_scores(i, loss, scores): - precision = '%.2f' % scores['ents_p'] - recall = '%.2f' % scores['ents_r'] - f_measure = '%.2f' % scores['ents_f'] - print('Epoch %d: %d %s %s %s' % ( - i, int(loss), precision, recall, f_measure)) - - -def read_examples(path): - path = Path(path) - with path.open() as file_: - sents = file_.read().strip().split('\n\n') - for sent in sents: - sent = sent.strip() - if not sent: - continue - tokens = sent.split('\n') - while tokens and tokens[0].startswith('#'): - tokens.pop(0) - words = [] - iob = [] - for token in tokens: - if token.strip(): - pieces = token.split('\t') - words.append(pieces[1]) - iob.append(pieces[2]) - yield words, iob_to_biluo(iob) - - -def get_labels(examples): - labels = set() - for words, tags in examples: - for tag in tags: - if '-' in tag: - labels.add(tag.split('-')[1]) - return sorted(labels) - - -@plac.annotations( - model_dir=("Path to save the model", "positional", None, Path), - train_loc=("Path to your training data", "positional", None, Path), - dev_loc=("Path to your development data", "positional", None, Path), -) -def main(model_dir, train_loc, dev_loc, nr_epoch=30): - print(model_dir, train_loc, dev_loc) - train_examples = list(read_examples(train_loc)) - dev_examples = read_examples(dev_loc) - nlp = Pipeline() - for label in get_labels(train_examples): - nlp.entity.add_label(label) - print("Add label", label) - - train(nlp, train_examples, list(dev_examples), nr_epoch) - - nlp.to_disk(model_dir) - - -if __name__ == '__main__': - plac.call(main) From e904075f35dde853f4f210fb4bb1ceebe781bc55 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 14:24:00 +0200 Subject: [PATCH 51/99] Remove stray print statements --- examples/training/train_new_entity_type.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index fc550b1ed..d3bdc4dcf 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -96,8 +96,6 @@ def main(model=None, new_model_name='animal', output_dir=None): nlp.update(docs, golds, losses=losses, sgd=optimizer, drop=0.35) print(losses) - print(nlp.pipeline) - print(disabled.original_pipeline) # test the trained model test_text = 'Do you like horses?' From 9d58673aaf84ed04e40f48e1bf7eb1a0c0b20723 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 14:24:12 +0200 Subject: [PATCH 52/99] Update train_ner example for spaCy v2.0 --- examples/training/train_ner.py | 138 ++++++++++++++++++++++----------- 1 file changed, 93 insertions(+), 45 deletions(-) diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index e9ae013d3..5a3e23244 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -1,13 +1,104 @@ +#!/usr/bin/env python +# coding: utf8 +""" +Example of training spaCy's named entity recognizer, starting off with an +existing model or a blank model. 
+ +For more details, see the documentation: +* Training: https://alpha.spacy.io/usage/training +* NER: https://alpha.spacy.io/usage/linguistic-features#named-entities + +Developed for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a18 +""" from __future__ import unicode_literals, print_function import random +from pathlib import Path -from spacy.lang.en import English +import spacy from spacy.gold import GoldParse, biluo_tags_from_offsets +# training data +TRAIN_DATA = [ + ('Who is Shaka Khan?', [(7, 17, 'PERSON')]), + ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')]) +] + + +def main(model=None, output_dir=None, n_iter=100): + """Load the model, set up the pipeline and train the entity recognizer. + + model (unicode): Model name to start off with. If None, a blank English + Language class is created. + output_dir (unicode / Path): Optional output directory. If None, no model + will be saved. + n_iter (int): Number of iterations during training. + """ + if model is not None: + nlp = spacy.load(model) # load existing spaCy model + print("Loaded model '%s'" % model) + else: + nlp = spacy.blank('en') # create blank Language class + print("Created blank 'en' model") + + # create the built-in pipeline components and add them to the pipeline + # ner.create_pipe works for built-ins that are registered with spaCy! + if 'ner' not in nlp.pipe_names: + ner = nlp.create_pipe('ner') + nlp.add_pipe(ner, last=True) + + # function that allows begin_training to get the training data + get_data = lambda: reformat_train_data(nlp.tokenizer, TRAIN_DATA) + + # get names of other pipes to disable them during training + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] + with nlp.disable_pipes(*other_pipes) as disabled: # only train NER + optimizer = nlp.begin_training(get_data) + for itn in range(n_iter): + random.shuffle(TRAIN_DATA) + losses = {} + for raw_text, entity_offsets in TRAIN_DATA: + doc = nlp.make_doc(raw_text) + gold = GoldParse(doc, entities=entity_offsets) + nlp.update( + [doc], # Batch of Doc objects + [gold], # Batch of GoldParse objects + drop=0.5, # Dropout -- make it harder to memorise data + sgd=optimizer, # Callable to update weights + losses=losses) + print(losses) + + # test the trained model + for text, _ in TRAIN_DATA: + doc = nlp(text) + print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) + print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) + + # save model to output directory + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + print("Saved model to", output_dir) + + # test the saved model + print("Loading from", output_dir) + for text, _ in TRAIN_DATA: + doc = nlp(text) + print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) + print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) + + def reformat_train_data(tokenizer, examples): - """Reformat data to match JSON format""" + """Reformat data to match JSON format. + https://alpha.spacy.io/api/annotation#json-input + + tokenizer (Tokenizer): Tokenizer to process the raw text. + examples (list): The trainig data. 
+ RETURNS (list): The reformatted training data.""" output = [] for i, (text, entity_offsets) in enumerate(examples): doc = tokenizer(text) @@ -21,49 +112,6 @@ def reformat_train_data(tokenizer, examples): return output -def main(model_dir=None): - train_data = [ - ( - 'Who is Shaka Khan?', - [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')] - ), - ( - 'I like London and Berlin.', - [(len('I like '), len('I like London'), 'LOC'), - (len('I like London and '), len('I like London and Berlin'), 'LOC')] - ) - ] - nlp = English(pipeline=['tensorizer', 'ner']) - get_data = lambda: reformat_train_data(nlp.tokenizer, train_data) - optimizer = nlp.begin_training(get_data) - for itn in range(100): - random.shuffle(train_data) - losses = {} - for raw_text, entity_offsets in train_data: - doc = nlp.make_doc(raw_text) - gold = GoldParse(doc, entities=entity_offsets) - nlp.update( - [doc], # Batch of Doc objects - [gold], # Batch of GoldParse objects - drop=0.5, # Dropout -- make it harder to memorise data - sgd=optimizer, # Callable to update weights - losses=losses) - print(losses) - print("Save to", model_dir) - nlp.to_disk(model_dir) - print("Load from", model_dir) - nlp = spacy.lang.en.English(pipeline=['tensorizer', 'ner']) - nlp.from_disk(model_dir) - for raw_text, _ in train_data: - doc = nlp(raw_text) - for word in doc: - print(word.text, word.ent_type_, word.ent_iob_) - if __name__ == '__main__': import plac plac.call(main) - # Who "" 2 - # is "" 2 - # Shaka "" PERSON 3 - # Khan "" PERSON 1 - # ? "" 2 From 8116d1a077cba9d32d3e4da21dcb6bd6c5356d70 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 14:44:32 +0200 Subject: [PATCH 53/99] Add note on biluo_tags_from_offsets helper --- website/api/_annotation/_training.jade | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/website/api/_annotation/_training.jade b/website/api/_annotation/_training.jade index 3b11eb2f5..d05bfa825 100644 --- a/website/api/_annotation/_training.jade +++ b/website/api/_annotation/_training.jade @@ -13,7 +13,9 @@ p | that are part of an entity are set to the entity label, prefixed by the | BILUO marker. For example #[code "B-ORG"] describes the first token of | a multi-token #[code ORG] entity and #[code "U-PERSON"] a single - | token representing a #[code PERSON] entity + | token representing a #[code PERSON] entity. The + | #[+api("goldparse#biluo_tags_from_offsets") #[code biluo_tags_from_offsets]] + | function can help you convert entity offsets to the right format. +code("Example structure"). [{ From 281f88a59c309f66f5b2a55c41a1418c3050142f Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 14:44:43 +0200 Subject: [PATCH 54/99] Update NER training examples --- website/usage/_training/_ner.jade | 92 ++++++++++++++++++++++--------- website/usage/examples.jade | 18 +++--- 2 files changed, 74 insertions(+), 36 deletions(-) diff --git a/website/usage/_training/_ner.jade b/website/usage/_training/_ner.jade index ed58c4c6f..12f92dbce 100644 --- a/website/usage/_training/_ner.jade +++ b/website/usage/_training/_ner.jade @@ -24,6 +24,58 @@ p | #[strong experiment on your own data] to find a solution that works best | for you. ++h(3, "example-train-ner") Updating the Named Entity Recognizer + +p + | This example shows how to update spaCy's entity recognizer + | with your own examples, starting off with an existing, pre-trained + | model, or from scratch using a blank #[code Language] class. 
To do + | this, you'll need #[strong example texts] and the + | #[strong character offsets] and #[strong labels] of each entity contained + | in the texts. + + +github("spacy", "examples/training/train_ner.py") + ++h(4) Step by step guide + ++list("numbers") + +item + | #[strong Reformat the training data] to match spaCy's + | #[+a("/api/annotation#json-input") JSON format]. The built-in + | #[+api("goldparse#biluo_tags_from_offsets") #[code biluo_tags_from_offsets]] + | function can help you with this. + + +item + | #[strong Load the model] you want to start with, or create an + | #[strong empty model] using + | #[+api("spacy#blank") #[code spacy.blank]] with the ID of your + | language. If you're using a blank model, don't forget to add the + | entity recognizer to the pipeline. If you're using an existing model, + | make sure to disable all other pipeline components during training + | using #[+api("language#disable_pipes") #[code nlp.disable_pipes]]. + | This way, you'll only be training the entity recognizer. + + +item + | #[strong Shuffle and loop over] the examples and create a + | #[code Doc] and #[code GoldParse] object for each example. + + +item + | For each example, #[strong update the model] + | by calling #[+api("language#update") #[code nlp.update]], which steps + | through the words of the input. At each word, it makes a + | #[strong prediction]. It then consults the annotations provided on the + | #[code GoldParse] instance, to see whether it was + | right. If it was wrong, it adjusts its weights so that the correct + | action will score higher next time. + + +item + | #[strong Save] the trained model using + | #[+api("language#to_disk") #[code nlp.to_disk]]. + + +item + | #[strong Test] the model to make sure the entities in the training + | data are recognised correctly. + +h(3, "example-new-entity-type") Training an additional entity type p @@ -38,22 +90,22 @@ p +github("spacy", "examples/training/train_new_entity_type.py") -p Training a new entity type requires the following steps: ++h(4) Step by step guide +list("numbers") +item - | Create #[+api("doc") #[code Doc]] and - | #[+api("goldparse") #[code GoldParse]] objects for + | Create #[code Doc] and #[code GoldParse] objects for | #[strong each example in your training data]. +item | #[strong Load the model] you want to start with, or create an | #[strong empty model] using - | #[+api("spacy#blank") #[code spacy.blank()]] with the ID of your - | language. If you're using an existing model, make sure to disable - | all other pipeline components during training using - | #[+api("language#disable_pipes") #[code nlp.disable_pipes]]. This way, - | you'll only be training the entity recognizer. + | #[+api("spacy#blank") #[code spacy.blank]] with the ID of your + | language. If you're using a blank model, don't forget to add the + | entity recognizer to the pipeline. If you're using an existing model, + | make sure to disable all other pipeline components during training + | using #[+api("language#disable_pipes") #[code nlp.disable_pipes]]. + | This way, you'll only be training the entity recognizer. +item | #[strong Add the new entity label] to the entity recognizer using the @@ -66,28 +118,14 @@ p Training a new entity type requires the following steps: | #[+api("language#update") #[code nlp.update]], which steps through | the words of the input. At each word, it makes a | #[strong prediction]. 
It then consults the annotations provided on the - | #[+api("goldparse") #[code GoldParse]] instance, to see whether it was - | right. If it was wrong, it adjusts its weights so that the correct - | action will score higher next time. + | #[code GoldParse] instance, to see whether it was right. If it was + | wrong, it adjusts its weights so that the correct action will score + | higher next time. +item | #[strong Save] the trained model using - | #[+api("language#to_disk") #[code nlp.to_disk()]]. + | #[+api("language#to_disk") #[code nlp.to_disk]]. +item - | #[strong Test] the model to make sure the new entity is recognized + | #[strong Test] the model to make sure the new entity is recognised | correctly. - -+h(3, "example-ner-from-scratch") Example: Training an NER system from scratch - -p - | This example is written to be self-contained and reasonably transparent. - | To achieve that, it duplicates some of spaCy's internal functionality. - | Specifically, in this example, we don't use spaCy's built-in - | #[+api("language") #[code Language]] class to wire together the - | #[+api("vocab") #[code Vocab]], #[+api("tokenizer") #[code Tokenizer]] - | and #[+api("entityrecognizer") #[code EntityRecognizer]]. Instead, we - | write our own simle #[code Pipeline] class, so that it's easier to see - | how the pieces interact. - -+github("spacy", "examples/training/train_ner_standalone.py") diff --git a/website/usage/examples.jade b/website/usage/examples.jade index 5dfeaf2a7..914ecafde 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -61,6 +61,15 @@ include ../_includes/_mixins +github("spacy", "examples/phrase_matcher.py") +section("training") + +h(3, "training-ner") Training spaCy's Named Entity Recognizer + + p + | This example shows how to update spaCy's entity recognizer + | with your own examples, starting off with an existing, pre-trained + | model, or from scratch using a blank #[code Language] class. + + +github("spacy", "examples/training/train_ner.py") + +h(3, "new-entity-type") Training an additional entity type p @@ -71,15 +80,6 @@ include ../_includes/_mixins +github("spacy", "examples/training/train_new_entity_type.py") - +h(3, "ner-standalone") Training an NER system from scratch - - p - | This example is written to be self-contained and reasonably - | transparent. To achieve that, it duplicates some of spaCy's internal - | functionality. - - +github("spacy", "examples/training/train_ner_standalone.py") - +h(3, "textcat") Training spaCy's text classifier +tag-new(2) From d425ede7e9e44e7fc003faf29524698a0531a1ff Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 15:15:08 +0200 Subject: [PATCH 55/99] Fix example --- examples/training/train_ner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index 5a3e23244..9427f452e 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -44,7 +44,7 @@ def main(model=None, output_dir=None, n_iter=100): print("Created blank 'en' model") # create the built-in pipeline components and add them to the pipeline - # ner.create_pipe works for built-ins that are registered with spaCy! 
+ # nlp.create_pipe works for built-ins that are registered with spaCy if 'ner' not in nlp.pipe_names: ner = nlp.create_pipe('ner') nlp.add_pipe(ner, last=True) @@ -86,8 +86,9 @@ def main(model=None, output_dir=None, n_iter=100): # test the saved model print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) for text, _ in TRAIN_DATA: - doc = nlp(text) + doc = nlp2(text) print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) From 586b9047fd1d2fcc750f2d9930b28a1ee0e25fff Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 15:15:26 +0200 Subject: [PATCH 56/99] Use create_pipe instead of importing the entity recognizer --- examples/training/train_new_entity_type.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index d3bdc4dcf..ea6c08763 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -34,7 +34,6 @@ from pathlib import Path import spacy from spacy.gold import GoldParse, minibatch -from spacy.pipeline import NeuralEntityRecognizer # new entity label @@ -77,10 +76,14 @@ def main(model=None, new_model_name='animal', output_dir=None): print("Created blank 'en' model") # Add entity recognizer to model if it's not in the pipeline + # nlp.create_pipe works for built-ins that are registered with spaCy if 'ner' not in nlp.pipe_names: - nlp.add_pipe(NeuralEntityRecognizer(nlp.vocab)) + ner = nlp.create_pipe('ner') + nlp.add_pipe(ner) + # otherwise, get it, so we can add labels to it + else: + ner = nlp.get_pipe('ner') - ner = nlp.get_pipe('ner') # get entity recognizer ner.add_label(LABEL) # add new entity label to entity recognizer # get names of other pipes to disable them during training From b5c74dbb34f035b71732e8bc37f0a43c859459ae Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 15:15:37 +0200 Subject: [PATCH 57/99] Update parser training example --- examples/training/train_parser.py | 153 +++++++++++++++++++----------- 1 file changed, 95 insertions(+), 58 deletions(-) diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index 8c3119704..d2c15c4c2 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -1,75 +1,112 @@ +#!/usr/bin/env python +# coding: utf8 +""" +Example of training spaCy dependency parser, starting off with an existing model +or a blank model. 
+ +For more details, see the documentation: +* Training: https://alpha.spacy.io/usage/training +* Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse + +Developed for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a18 +""" from __future__ import unicode_literals, print_function -import json -import pathlib + import random +from pathlib import Path import spacy -from spacy.pipeline import DependencyParser from spacy.gold import GoldParse from spacy.tokens import Doc -def train_parser(nlp, train_data, left_labels, right_labels): - parser = DependencyParser( - nlp.vocab, - left_labels=left_labels, - right_labels=right_labels) - for itn in range(1000): - random.shuffle(train_data) - loss = 0 - for words, heads, deps in train_data: - doc = Doc(nlp.vocab, words=words) - gold = GoldParse(doc, heads=heads, deps=deps) - loss += parser.update(doc, gold) - parser.model.end_training() - return parser +# training data +TRAIN_DATA = [ + ( + ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'], + [1, 1, 4, 4, 5, 1, 1], + ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct'] + ), + ( + ['I', 'like', 'London', 'and', 'Berlin', '.'], + [1, 1, 1, 2, 2, 1], + ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct'] + ) +] -def main(model_dir=None): - if model_dir is not None: - model_dir = pathlib.Path(model_dir) - if not model_dir.exists(): - model_dir.mkdir() - assert model_dir.is_dir() +def main(model=None, output_dir=None, n_iter=1000): + """Load the model, set up the pipeline and train the parser. - nlp = spacy.load('en', tagger=False, parser=False, entity=False, add_vectors=False) + model (unicode): Model name to start off with. If None, a blank English + Language class is created. + output_dir (unicode / Path): Optional output directory. If None, no model + will be saved. + n_iter (int): Number of iterations during training. 
+ """ + if model is not None: + nlp = spacy.load(model) # load existing spaCy model + print("Loaded model '%s'" % model) + else: + nlp = spacy.blank('en') # create blank Language class + print("Created blank 'en' model") - train_data = [ - ( - ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'], - [1, 1, 4, 4, 5, 1, 1], - ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct'] - ), - ( - ['I', 'like', 'London', 'and', 'Berlin', '.'], - [1, 1, 1, 2, 2, 1], - ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct'] - ) - ] - left_labels = set() - right_labels = set() - for _, heads, deps in train_data: - for i, (head, dep) in enumerate(zip(heads, deps)): - if i < head: - left_labels.add(dep) - elif i > head: - right_labels.add(dep) - parser = train_parser(nlp, train_data, sorted(left_labels), sorted(right_labels)) + # add the parser to the pipeline if it doesn't exist + # nlp.create_pipe works for built-ins that are registered with spaCy + if 'parser' not in nlp.pipe_names: + parser = nlp.create_pipe('parser') + nlp.add_pipe(parser, first=True) + # otherwise, get it, so we can add labels to it + else: + parser = nlp.get_pipe('parser') - doc = Doc(nlp.vocab, words=['I', 'like', 'securities', '.']) - parser(doc) - for word in doc: - print(word.text, word.dep_, word.head.text) + # add labels to the parser + for _, heads, deps in TRAIN_DATA: + for dep in deps: + parser.add_label(dep) - if model_dir is not None: - with (model_dir / 'config.json').open('w') as file_: - json.dump(parser.cfg, file_) - parser.model.dump(str(model_dir / 'model')) + # get names of other pipes to disable them during training + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser'] + with nlp.disable_pipes(*other_pipes) as disabled: # only train parser + optimizer = nlp.begin_training(lambda: []) + for itn in range(n_iter): + random.shuffle(TRAIN_DATA) + losses = {} + for words, heads, deps in TRAIN_DATA: + doc = Doc(nlp.vocab, words=words) + gold = GoldParse(doc, heads=heads, deps=deps) + nlp.update([doc], [gold], sgd=optimizer, losses=losses) + print(losses) + + # test the trained model + test_text = "I like securities." + doc = nlp(test_text) + print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc]) + + # save model to output directory + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + print("Saved model to", output_dir) + + # test the save model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + doc = nlp2(test_text) + print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc]) if __name__ == '__main__': - main() - # I nsubj like - # like ROOT like - # securities dobj like - # . 
cc securities + import plac + plac.call(main) + + # expected result: + # [ + # ('I', 'nsubj', 'like'), + # ('like', 'ROOT', 'like'), + # ('securities', 'dobj', 'like'), + # ('.', 'punct', 'like') + # ] From bc2c92f22dc7d4d92673b615f0fea75e18b0496e Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 16:10:56 +0200 Subject: [PATCH 58/99] Use plac annotations for arguments --- examples/training/train_ner.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index 9427f452e..2e8241ffc 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -13,6 +13,7 @@ Last updated for: spaCy 2.0.0a18 """ from __future__ import unicode_literals, print_function +import plac import random from pathlib import Path @@ -27,15 +28,12 @@ TRAIN_DATA = [ ] +@plac.annotations( + model=("Model name. Defaults to blank 'en' model.", "option", "m", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int)) def main(model=None, output_dir=None, n_iter=100): - """Load the model, set up the pipeline and train the entity recognizer. - - model (unicode): Model name to start off with. If None, a blank English - Language class is created. - output_dir (unicode / Path): Optional output directory. If None, no model - will be saved. - n_iter (int): Number of iterations during training. - """ + """Load the model, set up the pipeline and train the entity recognizer.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) @@ -114,5 +112,4 @@ def reformat_train_data(tokenizer, examples): if __name__ == '__main__': - import plac plac.call(main) From c3b681e5fbe157ea70167da1e67c740e8339af6f Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 16:11:05 +0200 Subject: [PATCH 59/99] Use plac annotations for arguments and add n_iter --- examples/training/train_new_entity_type.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index ea6c08763..69ee20e04 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -29,6 +29,7 @@ Last updated for: spaCy 2.0.0a18 """ from __future__ import unicode_literals, print_function +import plac import random from pathlib import Path @@ -58,16 +59,13 @@ TRAIN_DATA = [ ] -def main(model=None, new_model_name='animal', output_dir=None): - """Set up the pipeline and entity recognizer, and train the new entity. - - model (unicode): Model name to start off with. If None, a blank English - Language class is created. - new_model_name (unicode): Name of new model to create. Will be added to the - model meta and prefixed by the language code, e.g. 'en_animal'. - output_dir (unicode / Path): Optional output directory. If None, no model - will be saved. - """ +@plac.annotations( + model=("Model name. 
Defaults to blank 'en' model.", "option", "m", str), + new_model_name=("New model name for model meta.", "option", "nm", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int)) +def main(model=None, new_model_name='animal', output_dir=None, n_iter=50): + """Set up the pipeline and entity recognizer, and train the new entity.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) @@ -91,7 +89,7 @@ def main(model=None, new_model_name='animal', output_dir=None): with nlp.disable_pipes(*other_pipes) as disabled: # only train NER random.seed(0) optimizer = nlp.begin_training(lambda: []) - for itn in range(50): + for itn in range(n_iter): losses = {} gold_parses = get_gold_parses(nlp.make_doc, TRAIN_DATA) for batch in minibatch(gold_parses, size=3): @@ -139,5 +137,4 @@ def get_gold_parses(tokenizer, train_data): if __name__ == '__main__': - import plac plac.call(main) From 4d896171ae43a4faba1b3c5cf480e641beb84cf3 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 16:11:20 +0200 Subject: [PATCH 60/99] Use plac annotations for arguments --- examples/training/train_parser.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index d2c15c4c2..ad39ab7c3 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -13,6 +13,7 @@ Last updated for: spaCy 2.0.0a18 """ from __future__ import unicode_literals, print_function +import plac import random from pathlib import Path @@ -36,15 +37,12 @@ TRAIN_DATA = [ ] +@plac.annotations( + model=("Model name. Defaults to blank 'en' model.", "option", "m", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int)) def main(model=None, output_dir=None, n_iter=1000): - """Load the model, set up the pipeline and train the parser. - - model (unicode): Model name to start off with. If None, a blank English - Language class is created. - output_dir (unicode / Path): Optional output directory. If None, no model - will be saved. - n_iter (int): Number of iterations during training. 
- """ + """Load the model, set up the pipeline and train the parser.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) @@ -100,7 +98,6 @@ def main(model=None, output_dir=None, n_iter=1000): if __name__ == '__main__': - import plac plac.call(main) # expected result: From 421c3837e83c2322a2addb52cf8d293af18b54ad Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 16:11:25 +0200 Subject: [PATCH 61/99] Fix formatting --- examples/training/train_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index ad39ab7c3..8cd602bcd 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -60,7 +60,7 @@ def main(model=None, output_dir=None, n_iter=1000): parser = nlp.get_pipe('parser') # add labels to the parser - for _, heads, deps in TRAIN_DATA: + for _, _, deps in TRAIN_DATA: for dep in deps: parser.add_label(dep) From 9e372913e046f81ca3da4b5d6b4f92c6b5e6346e Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 16:11:57 +0200 Subject: [PATCH 62/99] Remove old 'SP' condition in tag map --- spacy/pipeline.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 7c1976dfa..14ebe0301 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -420,8 +420,6 @@ class NeuralTagger(BaseThincComponent): new_tag_map[tag] = orig_tag_map[tag] else: new_tag_map[tag] = {POS: X} - if 'SP' not in new_tag_map: - new_tag_map['SP'] = orig_tag_map.get('SP', {POS: X}) cdef Vocab vocab = self.vocab if new_tag_map: vocab.morphology = Morphology(vocab.strings, new_tag_map, From 2d6ec998842ea5773f9e66c6153b5b9ceb7a5c0a Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 16:12:23 +0200 Subject: [PATCH 63/99] Set 'model' as default model name to prevent meta.json errors --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index b836b8619..9ced836f0 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -151,7 +151,7 @@ class Language(object): @property def meta(self): self._meta.setdefault('lang', self.vocab.lang) - self._meta.setdefault('name', '') + self._meta.setdefault('name', 'model') self._meta.setdefault('version', '0.0.0') self._meta.setdefault('spacy_version', about.__version__) self._meta.setdefault('description', '') From 0575e9cf207b3986a8369bfe2cb1e240bf188917 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 16:12:34 +0200 Subject: [PATCH 64/99] Add parser example to docs --- website/usage/_training/_tagger-parser.jade | 52 ++++++++++++++++++++- website/usage/examples.jade | 9 ++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/website/usage/_training/_tagger-parser.jade b/website/usage/_training/_tagger-parser.jade index a62b9d43e..437ded9c9 100644 --- a/website/usage/_training/_tagger-parser.jade +++ b/website/usage/_training/_tagger-parser.jade @@ -1,6 +1,56 @@ //- 💫 DOCS > USAGE > TRAINING > TAGGER & PARSER -+under-construction ++h(3, "example-train-parser") Updating the parser + +p + | This example shows how to train spaCy's dependency parser, starting off + | with an existing model or a blank model. You'll need a set of + | #[strong training examples] and the respective #[strong heads] and + | #[strong dependency label] for each token of the example texts. 
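For reference, the annotation format the paragraph above refers to pairs each list of words with a list of head indices (each entry is the position of that token's syntactic head within the same sentence, with the root token pointing to itself) and a list of dependency labels. A minimal sketch of how one such example becomes a `GoldParse`, using the same spaCy 2.0-style API as the patches above — the sentence and labels are purely illustrative:

```python
# Illustrative sketch of the (words, heads, deps) annotation format.
# The sentence and labels below are made up for demonstration only.
import spacy
from spacy.gold import GoldParse
from spacy.tokens import Doc

nlp = spacy.blank('en')

words = ['I', 'like', 'London', '.']
heads = [1, 1, 1, 1]   # index of each token's head; the root ('like') points to itself
deps = ['nsubj', 'ROOT', 'dobj', 'punct']

doc = Doc(nlp.vocab, words=words)
gold = GoldParse(doc, heads=heads, deps=deps)
# doc and gold can then be passed to nlp.update([doc], [gold], sgd=optimizer)
```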
+ ++github("spacy", "examples/training/train_parser.py") + ++h(4) Step by step guide + ++list("numbers") + +item + | #[strong Load the model] you want to start with, or create an + | #[strong empty model] using + | #[+api("spacy#blank") #[code spacy.blank]] with the ID of your + | language. If you're using a blank model, don't forget to add the + | parser to the pipeline. If you're using an existing model, + | make sure to disable all other pipeline components during training + | using #[+api("language#disable_pipes") #[code nlp.disable_pipes]]. + | This way, you'll only be training the parser. + + +item + | #[strong Add the dependency labels] to the parser using the + | #[+api("dependencyparser#add_label") #[code add_label]] method. If + | you're starting off with a pre-trained spaCy model, this is usually + | not necessary – but it doesn't hurt either, just to be safe. + + +item + | #[strong Shuffle and loop over] the examples and create a + | #[code Doc] and #[code GoldParse] object for each example. Make sure + | to pass in the #[code heads] and #[code deps] when you create the + | #[code GoldParse]. + + +item + | For each example, #[strong update the model] + | by calling #[+api("language#update") #[code nlp.update]], which steps + | through the words of the input. At each word, it makes a + | #[strong prediction]. It then consults the annotations provided on the + | #[code GoldParse] instance, to see whether it was + | right. If it was wrong, it adjusts its weights so that the correct + | action will score higher next time. + + +item + | #[strong Save] the trained model using + | #[+api("language#to_disk") #[code nlp.to_disk]]. + + +item + | #[strong Test] the model to make sure the parser works as expected. + +h(3, "training-json") JSON format for training diff --git a/website/usage/examples.jade b/website/usage/examples.jade index 914ecafde..d6ad8bc23 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -80,6 +80,15 @@ include ../_includes/_mixins +github("spacy", "examples/training/train_new_entity_type.py") + +h(3, "parser") Training spaCy's parser + + p + | This example shows how to update spaCy's dependency parser, + | starting off with an existing, pre-trained model, or from scratch + | using a blank #[code Language] class. 
+ + +github("spacy", "examples/training/train_parser.py") + +h(3, "textcat") Training spaCy's text classifier +tag-new(2) From e44bbb53616e07ffcf855e7dea7bee9e3011d9da Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 16:12:41 +0200 Subject: [PATCH 65/99] Remove old example --- examples/training/load_ner.py | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 examples/training/load_ner.py diff --git a/examples/training/load_ner.py b/examples/training/load_ner.py deleted file mode 100644 index bf81cee50..000000000 --- a/examples/training/load_ner.py +++ /dev/null @@ -1,22 +0,0 @@ -# Load NER -from __future__ import unicode_literals -import spacy -import pathlib -from spacy.pipeline import EntityRecognizer -from spacy.vocab import Vocab - -def load_model(model_dir): - model_dir = pathlib.Path(model_dir) - nlp = spacy.load('en', parser=False, entity=False, add_vectors=False) - with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_: - nlp.vocab.strings.load(file_) - nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin') - ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True) - return (nlp, ner) - -(nlp, ner) = load_model('ner') -doc = nlp.make_doc('Who is Shaka Khan?') -nlp.tagger(doc) -ner(doc) -for word in doc: - print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob) From f1529463a80d9380c525e8870cda42e089801b38 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 16:19:02 +0200 Subject: [PATCH 66/99] Update tagger training example --- examples/training/train_tagger.py | 110 +++++++++++++++++------------- 1 file changed, 63 insertions(+), 47 deletions(-) diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index d5a519942..6b1fbcae8 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -1,18 +1,21 @@ -"""A quick example for training a part-of-speech tagger, without worrying -about the tokenization, or other language-specific customizations.""" - -from __future__ import unicode_literals -from __future__ import print_function +#!/usr/bin/env python +# coding: utf8 +""" +A simple example for training a part-of-speech tagger with a custom tag map. +To allow us to update the tag map with our custom one, this example starts off +with a blank Language class and modifies its defaults. +""" +from __future__ import unicode_literals, print_function import plac +import random from pathlib import Path -from spacy.vocab import Vocab -from spacy.tagger import Tagger +import spacy +from spacy.util import get_lang_class from spacy.tokens import Doc from spacy.gold import GoldParse -import random # You need to define a mapping from your data's part-of-speech tag names to the # Universal Part-of-Speech tag set, as spaCy includes an enum of these tags. @@ -28,54 +31,67 @@ TAG_MAP = { # Usually you'll read this in, of course. Data formats vary. # Ensure your strings are unicode. 
-DATA = [ - ( - ["I", "like", "green", "eggs"], - ["N", "V", "J", "N"] - ), - ( - ["Eat", "blue", "ham"], - ["V", "J", "N"] - ) +TRAIN_DATA = [ + (["I", "like", "green", "eggs"], ["N", "V", "J", "N"]), + (["Eat", "blue", "ham"], ["V", "J", "N"]) ] -def ensure_dir(path): - if not path.exists(): - path.mkdir() +@plac.annotations( + lang=("ISO Code of language to use", "option", "l", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int)) +def main(lang='en', output_dir=None, n_iter=25): + """Create a new model, set up the pipeline and train the tagger. In order to + train the tagger with a custom tag map, we're creating a new Language + instance with a custom vocab. + """ + lang_cls = get_lang_class(lang) # get Language class + lang_cls.Defaults.tag_map.update(TAG_MAP) # add tag map to defaults + nlp = lang_cls() # initialise Language class + # add the parser to the pipeline + # nlp.create_pipe works for built-ins that are registered with spaCy + tagger = nlp.create_pipe('tagger') + nlp.add_pipe(tagger) -def main(output_dir=None): + optimizer = nlp.begin_training(lambda: []) + for i in range(n_iter): + random.shuffle(TRAIN_DATA) + losses = {} + for words, tags in TRAIN_DATA: + doc = Doc(nlp.vocab, words=words) + gold = GoldParse(doc, tags=tags) + nlp.update([doc], [gold], sgd=optimizer, losses=losses) + print(losses) + + # test the trained model + test_text = "I like blue eggs" + doc = nlp(test_text) + print('Tags', [(t.text, t.tag_, t.pos_) for t in doc]) + + # save model to output directory if output_dir is not None: output_dir = Path(output_dir) - ensure_dir(output_dir) - ensure_dir(output_dir / "pos") - ensure_dir(output_dir / "vocab") + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + print("Saved model to", output_dir) - vocab = Vocab(tag_map=TAG_MAP) - # The default_templates argument is where features are specified. See - # spacy/tagger.pyx for the defaults. 
- tagger = Tagger(vocab) - for i in range(25): - for words, tags in DATA: - doc = Doc(vocab, words=words) - gold = GoldParse(doc, tags=tags) - tagger.update(doc, gold) - random.shuffle(DATA) - tagger.model.end_training() - doc = Doc(vocab, orths_and_spaces=zip(["I", "like", "blue", "eggs"], [True] * 4)) - tagger(doc) - for word in doc: - print(word.text, word.tag_, word.pos_) - if output_dir is not None: - tagger.model.dump(str(output_dir / 'pos' / 'model')) - with (output_dir / 'vocab' / 'strings.json').open('w') as file_: - tagger.vocab.strings.dump(file_) + # test the save model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + doc = nlp2(test_text) + print('Tags', [(t.text, t.tag_, t.pos_) for t in doc]) if __name__ == '__main__': plac.call(main) - # I V VERB - # like V VERB - # blue N NOUN - # eggs N NOUN + + # Expected output: + # [ + # ('I', 'N', 'NOUN'), + # ('like', 'V', 'VERB'), + # ('blue', 'J', 'ADJ'), + # ('eggs', 'N', 'NOUN') + # ] From b90e95897548f1f17b3f7607ffaeb544b8edde7b Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 16:27:42 +0200 Subject: [PATCH 67/99] Update tagger and parser examples and add to docs --- examples/training/train_tagger.py | 2 +- website/usage/_training/_tagger-parser.jade | 45 ++++++++++++++++++++- website/usage/examples.jade | 11 ++++- 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index 6b1fbcae8..b887b4592 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -50,7 +50,7 @@ def main(lang='en', output_dir=None, n_iter=25): lang_cls.Defaults.tag_map.update(TAG_MAP) # add tag map to defaults nlp = lang_cls() # initialise Language class - # add the parser to the pipeline + # add the tagger to the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy tagger = nlp.create_pipe('tagger') nlp.add_pipe(tagger) diff --git a/website/usage/_training/_tagger-parser.jade b/website/usage/_training/_tagger-parser.jade index 437ded9c9..c32577a73 100644 --- a/website/usage/_training/_tagger-parser.jade +++ b/website/usage/_training/_tagger-parser.jade @@ -1,6 +1,6 @@ //- 💫 DOCS > USAGE > TRAINING > TAGGER & PARSER -+h(3, "example-train-parser") Updating the parser ++h(3, "example-train-parser") Updating the Dependency Parser p | This example shows how to train spaCy's dependency parser, starting off @@ -51,6 +51,49 @@ p +item | #[strong Test] the model to make sure the parser works as expected. ++h(3, "example-train-tagger") Updating the Part-of-speech Tagger + +p + | In this example, we're training spaCy's part-of-speech tagger with a + | custom tag map. We start off with a blank #[code Language] class, update + | its defaults with our custom tags and then train the tagger. You'll need + | a set of #[strong training examples] and the respective + | #[strong custom tags], as well as a dictionary mapping those tags to the + | #[+a("http://universaldependencies.github.io/docs/u/pos/index.html") Universal Dependencies scheme]. + ++github("spacy", "examples/training/train_tagger.py") + ++h(4) Step by step guide + ++list("numbers") + +item + | #[strong Create] a new #[code Language] class and before initialising + | it, update the #[code tag_map] in its #[code Defaults] with your + | custom tags. + + +item + | #[strong Create a new tagger] component and add it to the pipeline. 
+ + +item + | #[strong Shuffle and loop over] the examples and create a + | #[code Doc] and #[code GoldParse] object for each example. Make sure + | to pass in the #[code tags] when you create the #[code GoldParse]. + + +item + | For each example, #[strong update the model] + | by calling #[+api("language#update") #[code nlp.update]], which steps + | through the words of the input. At each word, it makes a + | #[strong prediction]. It then consults the annotations provided on the + | #[code GoldParse] instance, to see whether it was + | right. If it was wrong, it adjusts its weights so that the correct + | action will score higher next time. + + +item + | #[strong Save] the trained model using + | #[+api("language#to_disk") #[code nlp.to_disk]]. + + +item + | #[strong Test] the model to make sure the parser works as expected. +h(3, "training-json") JSON format for training diff --git a/website/usage/examples.jade b/website/usage/examples.jade index d6ad8bc23..6641a83c6 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -80,7 +80,7 @@ include ../_includes/_mixins +github("spacy", "examples/training/train_new_entity_type.py") - +h(3, "parser") Training spaCy's parser + +h(3, "parser") Training spaCy's Dependency Parser p | This example shows how to update spaCy's dependency parser, @@ -89,6 +89,15 @@ include ../_includes/_mixins +github("spacy", "examples/training/train_parser.py") + +h(3, "tagger") Training spaCy's Part-of-speech Tagger + + p + | In this example, we're training spaCy's part-of-speech tagger with a + | custom tag map, mapping our own tags to the mapping those tags to the + | #[+a("http://universaldependencies.github.io/docs/u/pos/index.html") Universal Dependencies scheme]. + + +github("spacy", "examples/training/train_tagger.py") + +h(3, "textcat") Training spaCy's text classifier +tag-new(2) From f57043e6fe091ebaf2f4a1220215a8bb7a4b5099 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 16:29:08 +0200 Subject: [PATCH 68/99] Update docstring --- examples/training/train_tagger.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index b887b4592..c6fc1de88 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -4,6 +4,13 @@ A simple example for training a part-of-speech tagger with a custom tag map. To allow us to update the tag map with our custom one, this example starts off with a blank Language class and modifies its defaults. + +For more details, see the documentation: +* Training: https://alpha.spacy.io/usage/training +* POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging + +Developed for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a18 """ from __future__ import unicode_literals, print_function From bca5372fb16b15c1d2bc01b3cd866c15ba20bba7 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 17:32:59 +0200 Subject: [PATCH 69/99] Clean up examples --- examples/phrase_matcher.py | 60 +++++++++++++++++++---------------- examples/vectors_fast_text.py | 14 ++++---- 2 files changed, 41 insertions(+), 33 deletions(-) diff --git a/examples/phrase_matcher.py b/examples/phrase_matcher.py index ca9b0cc92..2dd2691b9 100644 --- a/examples/phrase_matcher.py +++ b/examples/phrase_matcher.py @@ -4,22 +4,24 @@ The idea is to associate each word in the vocabulary with a tag, noting whether they begin, end, or are inside at least one pattern. An additional tag is used for single-word patterns. 
Complete patterns are also stored in a hash set. -When we process a document, we look up the words in the vocabulary, to associate -the words with the tags. We then search for tag-sequences that correspond to -valid candidates. Finally, we look up the candidates in the hash set. +When we process a document, we look up the words in the vocabulary, to +associate the words with the tags. We then search for tag-sequences that +correspond to valid candidates. Finally, we look up the candidates in the hash +set. -For instance, to search for the phrases "Barack Hussein Obama" and "Hilary Clinton", we -would associate "Barack" and "Hilary" with the B tag, Hussein with the I tag, -and Obama and Clinton with the L tag. +For instance, to search for the phrases "Barack Hussein Obama" and "Hilary +Clinton", we would associate "Barack" and "Hilary" with the B tag, Hussein with +the I tag, and Obama and Clinton with the L tag. The document "Barack Clinton and Hilary Clinton" would have the tag sequence -[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second candidate -is in the phrase dictionary, so only one is returned as a match. +[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second +candidate is in the phrase dictionary, so only one is returned as a match. -The algorithm is O(n) at run-time for document of length n because we're only ever -matching over the tag patterns. So no matter how many phrases we're looking for, -our pattern set stays very small (exact size depends on the maximum length we're -looking for, as the query language currently has no quantifiers) +The algorithm is O(n) at run-time for document of length n because we're only +ever matching over the tag patterns. So no matter how many phrases we're +looking for, our pattern set stays very small (exact size depends on the +maximum length we're looking for, as the query language currently has no +quantifiers). The example expects a .bz2 file from the Reddit corpus, and a patterns file, formatted in jsonl as a sequence of entries like this: @@ -32,11 +34,9 @@ formatted in jsonl as a sequence of entries like this: {"text":"Argentina"} """ from __future__ import print_function, unicode_literals, division + from bz2 import BZ2File import time -import math -import codecs - import plac import ujson @@ -44,6 +44,24 @@ from spacy.matcher import PhraseMatcher import spacy +@plac.annotations( + patterns_loc=("Path to gazetteer", "positional", None, str), + text_loc=("Path to Reddit corpus file", "positional", None, str), + n=("Number of texts to read", "option", "n", int), + lang=("Language class to initialise", "option", "l", str)) +def main(patterns_loc, text_loc, n=10000, lang='en'): + nlp = spacy.blank('en') + nlp.vocab.lex_attr_getters = {} + phrases = read_gazetteer(nlp.tokenizer, patterns_loc) + count = 0 + t1 = time.time() + for ent_id, text in get_matches(nlp.tokenizer, phrases, + read_text(text_loc, n=n)): + count += 1 + t2 = time.time() + print("%d docs in %.3f s. 
%d matches" % (n, (t2 - t1), count)) + + def read_gazetteer(tokenizer, loc, n=-1): for i, line in enumerate(open(loc)): data = ujson.loads(line.strip()) @@ -75,18 +93,6 @@ def get_matches(tokenizer, phrases, texts, max_length=6): yield (ent_id, doc[start:end].text) -def main(patterns_loc, text_loc, n=10000): - nlp = spacy.blank('en') - nlp.vocab.lex_attr_getters = {} - phrases = read_gazetteer(nlp.tokenizer, patterns_loc) - count = 0 - t1 = time.time() - for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)): - count += 1 - t2 = time.time() - print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count)) - - if __name__ == '__main__': if False: import cProfile diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py index 9aa9fda56..323d5803f 100644 --- a/examples/vectors_fast_text.py +++ b/examples/vectors_fast_text.py @@ -1,16 +1,18 @@ -'''Load vectors for a language trained using FastText - +#!/usr/bin/env python +# coding: utf8 +"""Load vectors for a language trained using FastText https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md -''' +""" from __future__ import unicode_literals import plac import numpy -import spacy.language +import from spacy.language import Language +@plac.annotations(vectors_loc=("Path to vectors", "positional", None, str)) def main(vectors_loc): - nlp = spacy.language.Language() + nlp = Language() with open(vectors_loc, 'rb') as file_: header = file_.readline() @@ -18,7 +20,7 @@ def main(vectors_loc): nlp.vocab.clear_vectors(int(nr_dim)) for line in file_: line = line.decode('utf8') - pieces = line.split() + pieces = line.split() word = pieces[0] vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f') nlp.vocab.set_vector(word, vector) From daed7ff8fedf8d7bc202ec706eed5d53e70cef77 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 18:46:11 +0200 Subject: [PATCH 70/99] Update information extraction examples --- examples/get_parse_subregions.py | 59 ----------------- examples/information_extraction.py | 59 ----------------- .../entity_relations.py | 62 ++++++++++++++++++ .../information_extraction/parse_subtrees.py | 65 +++++++++++++++++++ .../phrase_matcher.py | 0 website/usage/_data.json | 2 +- website/usage/examples.jade | 51 +++++++++------ 7 files changed, 159 insertions(+), 139 deletions(-) delete mode 100644 examples/get_parse_subregions.py delete mode 100644 examples/information_extraction.py create mode 100644 examples/information_extraction/entity_relations.py create mode 100644 examples/information_extraction/parse_subtrees.py rename examples/{ => information_extraction}/phrase_matcher.py (100%) diff --git a/examples/get_parse_subregions.py b/examples/get_parse_subregions.py deleted file mode 100644 index 5eb4f2c77..000000000 --- a/examples/get_parse_subregions.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Issue #252 - -Question: - -In the documents and tutorials the main thing I haven't found is examples on how to break sentences down into small sub thoughts/chunks. The noun_chunks is handy, but having examples on using the token.head to find small (near-complete) sentence chunks would be neat. 
- -Lets take the example sentence on https://displacy.spacy.io/displacy/index.html - -displaCy uses CSS and JavaScript to show you how computers understand language -This sentence has two main parts (XCOMP & CCOMP) according to the breakdown: - -[displaCy] uses CSS and Javascript [to + show] -& -show you how computers understand [language] -I'm assuming that we can use the token.head to build these groups. In one of your examples you had the following function. - -def dependency_labels_to_root(token): - '''Walk up the syntactic tree, collecting the arc labels.''' - dep_labels = [] - while token.head is not token: - dep_labels.append(token.dep) - token = token.head - return dep_labels -""" -from __future__ import print_function, unicode_literals - -# Answer: -# The easiest way is to find the head of the subtree you want, and then use the -# `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` is the -# one that does what you're asking for most directly: - -from spacy.en import English -nlp = English() - -doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language') -for word in doc: - if word.dep_ in ('xcomp', 'ccomp'): - print(''.join(w.text_with_ws for w in word.subtree)) - -# It'd probably be better for `word.subtree` to return a `Span` object instead -# of a generator over the tokens. If you want the `Span` you can get it via the -# `.right_edge` and `.left_edge` properties. The `Span` object is nice because -# you can easily get a vector, merge it, etc. - -doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language') -for word in doc: - if word.dep_ in ('xcomp', 'ccomp'): - subtree_span = doc[word.left_edge.i : word.right_edge.i + 1] - print(subtree_span.text, '|', subtree_span.root.text) - print(subtree_span.similarity(doc)) - print(subtree_span.similarity(subtree_span.root)) - - -# You might also want to select a head, and then select a start and end position by -# walking along its children. You could then take the `.left_edge` and `.right_edge` -# of those tokens, and use it to calculate a span. - - - diff --git a/examples/information_extraction.py b/examples/information_extraction.py deleted file mode 100644 index 19e93b499..000000000 --- a/examples/information_extraction.py +++ /dev/null @@ -1,59 +0,0 @@ -import plac - -from spacy.en import English -from spacy.parts_of_speech import NOUN -from spacy.parts_of_speech import ADP as PREP - - -def _span_to_tuple(span): - start = span[0].idx - end = span[-1].idx + len(span[-1]) - tag = span.root.tag_ - text = span.text - label = span.label_ - return (start, end, tag, text, label) - -def merge_spans(spans, doc): - # This is a bit awkward atm. What we're doing here is merging the entities, - # so that each only takes up a single token. But an entity is a Span, and - # each Span is a view into the doc. When we merge a span, we invalidate - # the other spans. This will get fixed --- but for now the solution - # is to gather the information first, before merging. 
- tuples = [_span_to_tuple(span) for span in spans] - for span_tuple in tuples: - doc.merge(*span_tuple) - - -def extract_currency_relations(doc): - merge_spans(doc.ents, doc) - merge_spans(doc.noun_chunks, doc) - - relations = [] - for money in filter(lambda w: w.ent_type_ == 'MONEY', doc): - if money.dep_ in ('attr', 'dobj'): - subject = [w for w in money.head.lefts if w.dep_ == 'nsubj'] - if subject: - subject = subject[0] - relations.append((subject, money)) - elif money.dep_ == 'pobj' and money.head.dep_ == 'prep': - relations.append((money.head.head, money)) - - return relations - - -def main(): - nlp = English() - texts = [ - u'Net income was $9.4 million compared to the prior year of $2.7 million.', - u'Revenue exceeded twelve billion dollars, with a loss of $1b.', - ] - - for text in texts: - doc = nlp(text) - relations = extract_currency_relations(doc) - for r1, r2 in relations: - print(r1.text, r2.ent_type_, r2.text) - - -if __name__ == '__main__': - plac.call(main) diff --git a/examples/information_extraction/entity_relations.py b/examples/information_extraction/entity_relations.py new file mode 100644 index 000000000..b73dcbf3b --- /dev/null +++ b/examples/information_extraction/entity_relations.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# coding: utf8 +""" +A simple example of extracting relations between phrases and entities using +spaCy's named entity recognizer and the dependency parse. Here, we extract +money and currency values (entities labelled as MONEY) and then check the +dependency tree to find the noun phrase they are referring to – for example: +$9.4 million --> Net income. + +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import unicode_literals, print_function + +import plac +import spacy + + +TEXTS = [ + 'Net income was $9.4 million compared to the prior year of $2.7 million.', + 'Revenue exceeded twelve billion dollars, with a loss of $1b.', +] + + +@plac.annotations( + model=("Model to load (needs parser and NER)", "positional", None, str)) +def main(model='en_core_web_sm'): + nlp = spacy.load(model) + print("Loaded model '%s'" % model) + print("Processing %d texts" % len(TEXTS)) + + for text in TEXTS: + doc = nlp(text) + relations = extract_currency_relations(doc) + for r1, r2 in relations: + print('{:<10}\t{}\t{}'.format(r1.text, r2.ent_type_, r2.text)) + + +def extract_currency_relations(doc): + # merge entities and noun chunks into one token + for span in [*list(doc.ents), *list(doc.noun_chunks)]: + span.merge() + + relations = [] + for money in filter(lambda w: w.ent_type_ == 'MONEY', doc): + if money.dep_ in ('attr', 'dobj'): + subject = [w for w in money.head.lefts if w.dep_ == 'nsubj'] + if subject: + subject = subject[0] + relations.append((subject, money)) + elif money.dep_ == 'pobj' and money.head.dep_ == 'prep': + relations.append((money.head.head, money)) + return relations + + +if __name__ == '__main__': + plac.call(main) + + # Expected output: + # Net income MONEY $9.4 million + # the prior year MONEY $2.7 million + # Revenue MONEY twelve billion dollars + # a loss MONEY 1b diff --git a/examples/information_extraction/parse_subtrees.py b/examples/information_extraction/parse_subtrees.py new file mode 100644 index 000000000..5963d014c --- /dev/null +++ b/examples/information_extraction/parse_subtrees.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +# coding: utf8 +""" +This example shows how to navigate the parse tree including subtrees attached +to a word. 
+ +Based on issue #252: +"In the documents and tutorials the main thing I haven't found is +examples on how to break sentences down into small sub thoughts/chunks. The +noun_chunks is handy, but having examples on using the token.head to find small +(near-complete) sentence chunks would be neat. Lets take the example sentence: +"displaCy uses CSS and JavaScript to show you how computers understand language" + +This sentence has two main parts (XCOMP & CCOMP) according to the breakdown: +[displaCy] uses CSS and Javascript [to + show] +show you how computers understand [language] + +I'm assuming that we can use the token.head to build these groups." + +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import unicode_literals, print_function + +import plac +import spacy + + +@plac.annotations( + model=("Model to load", "positional", None, str)) +def main(model='en_core_web_sm'): + nlp = spacy.load(model) + print("Loaded model '%s'" % model) + + doc = nlp("displaCy uses CSS and JavaScript to show you how computers " + "understand language") + + # The easiest way is to find the head of the subtree you want, and then use + # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` + # is the one that does what you're asking for most directly: + for word in doc: + if word.dep_ in ('xcomp', 'ccomp'): + print(''.join(w.text_with_ws for w in word.subtree)) + + # It'd probably be better for `word.subtree` to return a `Span` object + # instead of a generator over the tokens. If you want the `Span` you can + # get it via the `.right_edge` and `.left_edge` properties. The `Span` + # object is nice because you can easily get a vector, merge it, etc. + for word in doc: + if word.dep_ in ('xcomp', 'ccomp'): + subtree_span = doc[word.left_edge.i : word.right_edge.i + 1] + print(subtree_span.text, '|', subtree_span.root.text) + + # You might also want to select a head, and then select a start and end + # position by walking along its children. You could then take the + # `.left_edge` and `.right_edge` of those tokens, and use it to calculate + # a span. + +if __name__ == '__main__': + plac.call(main) + + # Expected output: + # to show you how computers understand language + # how computers understand language + # to show you how computers understand language | show + # how computers understand language | understand diff --git a/examples/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py similarity index 100% rename from examples/phrase_matcher.py rename to examples/information_extraction/phrase_matcher.py diff --git a/website/usage/_data.json b/website/usage/_data.json index cc9918631..c34b5f2b0 100644 --- a/website/usage/_data.json +++ b/website/usage/_data.json @@ -196,8 +196,8 @@ "teaser": "Full code examples you can modify and run.", "next": "resources", "menu": { + "Information Extraction": "information-extraction", "Pipeline": "pipeline", - "Matching": "matching", "Training": "training", "Deep Learning": "deep-learning" } diff --git a/website/usage/examples.jade b/website/usage/examples.jade index 6641a83c6..74d562e27 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -2,6 +2,37 @@ include ../_includes/_mixins ++section("information-extraction") + +h(3, "phrase-matcher") Using spaCy's phrase matcher + +tag-new(2) + + p + | This example shows how to use the new + | #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find + | entities from a large terminology list. 
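Before the full streaming implementation, it may help to see the core `PhraseMatcher` workflow in isolation. This is a minimal sketch assuming the spaCy 2.0-style API used elsewhere in these patches; the terminology list and example sentence are placeholders:

```python
# Minimal PhraseMatcher sketch; the terms and sentence are placeholders.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')             # only the tokenizer is needed for matching
matcher = PhraseMatcher(nlp.vocab)
terms = ['Barack Obama', 'Angela Merkel']
matcher.add('TERMS', None, *[nlp(term) for term in terms])

doc = nlp("Angela Merkel met Barack Obama in Berlin.")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
```

The linked example applies the same idea at scale, streaming texts and matching against a much larger gazetteer of phrases.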
+ + +github("spacy", "examples/information_extraction/phrase_matcher.py") + + +h(3, "entity-relations") Extracting entity relations + + p + | A simple example of extracting relations between phrases and + | entities using spaCy's named entity recognizer and the dependency + | parse. Here, we extract money and currency values (entities labelled + | as #[code MONEY]) and then check the dependency tree to find the + | noun phrase they are referring to – for example: "$9.4 million" + | → "Net income". + + +github("spacy", "examples/information_extraction/entity_relations.py") + + +h(3, "subtrees") Navigating the parse tree and subtrees + + p + | This example shows how to navigate the parse tree including subtrees + | attached to a word. + + +github("spacy", "examples/information_extraction/parse_subtrees.py") + +section("pipeline") +h(3, "custom-components-entities") Custom pipeline components and attribute extensions +tag-new(2) @@ -40,26 +71,6 @@ include ../_includes/_mixins +github("spacy", "examples/pipeline/custom_attr_methods.py") -+section("matching") - +h(3, "matcher") Using spaCy's rule-based matcher - - p - | This example shows how to use spaCy's rule-based - | #[+api("matcher") #[code Matcher]] to find and label entities across - | documents. - - +github("spacy", "examples/matcher_example.py") - - +h(3, "phrase-matcher") Using spaCy's phrase matcher - +tag-new(2) - - p - | This example shows how to use the new - | #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find - | entities from a large terminology list. - - +github("spacy", "examples/phrase_matcher.py") - +section("training") +h(3, "training-ner") Training spaCy's Named Entity Recognizer From db843735d3a94826784492709afa0d26129eddd6 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 18:46:25 +0200 Subject: [PATCH 71/99] Remove outdated examples --- examples/inventory_count/Instructions.md | 5 - examples/inventory_count/inventory.py | 35 ----- examples/inventory_count/inventoryCount.py | 92 ------------ examples/inventory_count/main.py | 30 ---- examples/matcher_example.py | 161 --------------------- examples/twitter_filter.py | 36 ----- 6 files changed, 359 deletions(-) delete mode 100644 examples/inventory_count/Instructions.md delete mode 100644 examples/inventory_count/inventory.py delete mode 100644 examples/inventory_count/inventoryCount.py delete mode 100644 examples/inventory_count/main.py delete mode 100644 examples/matcher_example.py delete mode 100644 examples/twitter_filter.py diff --git a/examples/inventory_count/Instructions.md b/examples/inventory_count/Instructions.md deleted file mode 100644 index 456f5d4fe..000000000 --- a/examples/inventory_count/Instructions.md +++ /dev/null @@ -1,5 +0,0 @@ -An example of inventory counting using SpaCy.io NLP library. Meant to show how to instantiate Spacy's English class, and allow reusability by reloading the main module. - -In the future, a better implementation of this library would be to apply machine learning to each query and learn what to classify as the quantitative statement (55 kgs OF), vs the actual item of count (how likely is a preposition object to be the item of count if x,y,z qualifications appear in the statement). 
- - diff --git a/examples/inventory_count/inventory.py b/examples/inventory_count/inventory.py deleted file mode 100644 index abc031513..000000000 --- a/examples/inventory_count/inventory.py +++ /dev/null @@ -1,35 +0,0 @@ -class Inventory: - """ - Inventory class - a struct{} like feature to house inventory counts - across modules. - """ - originalQuery = None - item = "" - unit = "" - amount = "" - - def __init__(self, statement): - """ - Constructor - only takes in the original query/statement - :return: new Inventory object - """ - - self.originalQuery = statement - pass - - def __str__(self): - return str(self.amount) + ' ' + str(self.unit) + ' ' + str(self.item) - - def printInfo(self): - print '-------------Inventory Count------------' - print "Original Query: " + str(self.originalQuery) - print 'Amount: ' + str(self.amount) - print 'Unit: ' + str(self.unit) - print 'Item: ' + str(self.item) - print '----------------------------------------' - - def isValid(self): - if not self.item or not self.unit or not self.amount or not self.originalQuery: - return False - else: - return True diff --git a/examples/inventory_count/inventoryCount.py b/examples/inventory_count/inventoryCount.py deleted file mode 100644 index b1b7b43c8..000000000 --- a/examples/inventory_count/inventoryCount.py +++ /dev/null @@ -1,92 +0,0 @@ -from inventory import Inventory - - -def runTest(nlp): - testset = [] - testset += [nlp(u'6 lobster cakes')] - testset += [nlp(u'6 avacados')] - testset += [nlp(u'fifty five carrots')] - testset += [nlp(u'i have 55 carrots')] - testset += [nlp(u'i got me some 9 cabbages')] - testset += [nlp(u'i got 65 kgs of carrots')] - - result = [] - for doc in testset: - c = decodeInventoryEntry_level1(doc) - if not c.isValid(): - c = decodeInventoryEntry_level2(doc) - result.append(c) - - for i in result: - i.printInfo() - - -def decodeInventoryEntry_level1(document): - """ - Decodes a basic entry such as: '6 lobster cake' or '6' cakes - @param document : NLP Doc object - :return: Status if decoded correctly (true, false), and Inventory object - """ - count = Inventory(str(document)) - for token in document: - if token.pos_ == (u'NOUN' or u'NNS' or u'NN'): - item = str(token) - - for child in token.children: - if child.dep_ == u'compound' or child.dep_ == u'ad': - item = str(child) + str(item) - elif child.dep_ == u'nummod': - count.amount = str(child).strip() - for numerical_child in child.children: - # this isn't arithmetic rather than treating it such as a string - count.amount = str(numerical_child) + str(count.amount).strip() - else: - print "WARNING: unknown child: " + str(child) + ':'+str(child.dep_) - - count.item = item - count.unit = item - - return count - - -def decodeInventoryEntry_level2(document): - """ - Entry level 2, a more complicated parsing scheme that covers examples such as - 'i have 80 boxes of freshly baked pies' - - @document @param document : NLP Doc object - :return: Status if decoded correctly (true, false), and Inventory object- - """ - - count = Inventory(str(document)) - - for token in document: - # Look for a preposition object that is a noun (this is the item we are counting). 
- # If found, look at its' dependency (if a preposition that is not indicative of - # inventory location, the dependency of the preposition must be a noun - - if token.dep_ == (u'pobj' or u'meta') and token.pos_ == (u'NOUN' or u'NNS' or u'NN'): - item = '' - - # Go through all the token's children, these are possible adjectives and other add-ons - # this deals with cases such as 'hollow rounded waffle pancakes" - for i in token.children: - item += ' ' + str(i) - - item += ' ' + str(token) - count.item = item - - # Get the head of the item: - if token.head.dep_ != u'prep': - # Break out of the loop, this is a confusing entry - break - else: - amountUnit = token.head.head - count.unit = str(amountUnit) - - for inner in amountUnit.children: - if inner.pos_ == u'NUM': - count.amount += str(inner) - return count - - diff --git a/examples/inventory_count/main.py b/examples/inventory_count/main.py deleted file mode 100644 index cbc9e25c3..000000000 --- a/examples/inventory_count/main.py +++ /dev/null @@ -1,30 +0,0 @@ -import inventoryCount as mainModule -import os -from spacy.en import English - -if __name__ == '__main__': - """ - Main module for this example - loads the English main NLP class, - and keeps it in RAM while waiting for the user to re-run it. Allows the - developer to re-edit their module under testing without having - to wait as long to load the English class - """ - - # Set the NLP object here for the parameters you want to see, - # or just leave it blank and get all the opts - print "Loading English module... this will take a while." - nlp = English() - print "Done loading English module." - while True: - try: - reload(mainModule) - mainModule.runTest(nlp) - raw_input('================ To reload main module, press Enter ================') - - - except Exception, e: - print "Unexpected error: " + str(e) - continue - - - diff --git a/examples/matcher_example.py b/examples/matcher_example.py deleted file mode 100644 index 041b98a9a..000000000 --- a/examples/matcher_example.py +++ /dev/null @@ -1,161 +0,0 @@ -from __future__ import unicode_literals, print_function - -import spacy.en -import spacy.matcher -from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63 - -import plac - - -def main(): - nlp = spacy.en.English() - example = u"I prefer Siri to Google Now. I'll google now to find out how the google now service works." - before = nlp(example) - print("Before") - for ent in before.ents: - print(ent.text, ent.label_, [w.tag_ for w in ent]) - # Output: - # Google ORG [u'NNP'] - # google ORG [u'VB'] - # google ORG [u'NNP'] - nlp.matcher.add( - "GoogleNow", # Entity ID: Not really used at the moment. - "PRODUCT", # Entity type: should be one of the types in the NER data - {"wiki_en": "Google_Now"}, # Arbitrary attributes. Currently unused. - [ # List of patterns that can be Surface Forms of the entity - - # This Surface Form matches "Google Now", verbatim - [ # Each Surface Form is a list of Token Specifiers. - { # This Token Specifier matches tokens whose orth field is "Google" - ORTH: "Google" - }, - { # This Token Specifier matches tokens whose orth field is "Now" - ORTH: "Now" - } - ], - [ # This Surface Form matches "google now", verbatim, and requires - # "google" to have the NNP tag. 
This helps prevent the pattern from - # matching cases like "I will google now to look up the time" - { - ORTH: "google", - TAG: "NNP" - }, - { - ORTH: "now" - } - ] - ] - ) - after = nlp(example) - print("After") - for ent in after.ents: - print(ent.text, ent.label_, [w.tag_ for w in ent]) - # Output - # Google Now PRODUCT [u'NNP', u'RB'] - # google ORG [u'VB'] - # google now PRODUCT [u'NNP', u'RB'] - # - # You can customize attribute values in the lexicon, and then refer to the - # new attributes in your Token Specifiers. - # This is particularly good for word-set membership. - # - australian_capitals = ['Brisbane', 'Sydney', 'Canberra', 'Melbourne', 'Hobart', - 'Darwin', 'Adelaide', 'Perth'] - # Internally, the tokenizer immediately maps each token to a pointer to a - # LexemeC struct. These structs hold various features, e.g. the integer IDs - # of the normalized string forms. - # For our purposes, the key attribute is a 64-bit integer, used as a bit field. - # spaCy currently only uses 12 of the bits for its built-in features, so - # the others are available for use. It's best to use the higher bits, as - # future versions of spaCy may add more flags. For instance, we might add - # a built-in IS_MONTH flag, taking up FLAG13. So, we bind our user-field to - # FLAG63 here. - is_australian_capital = FLAG63 - # Now we need to set the flag value. It's False on all tokens by default, - # so we just need to set it to True for the tokens we want. - # Here we iterate over the strings, and set it on only the literal matches. - for string in australian_capitals: - lexeme = nlp.vocab[string] - lexeme.set_flag(is_australian_capital, True) - print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital)) - print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital)) - # If we want case-insensitive matching, we have to be a little bit more - # round-about, as there's no case-insensitive index to the vocabulary. So - # we have to iterate over the vocabulary. - # We'll be looking up attribute IDs in this set a lot, so it's good to pre-build it - target_ids = {nlp.vocab.strings[s.lower()] for s in australian_capitals} - for lexeme in nlp.vocab: - if lexeme.lower in target_ids: - lexeme.set_flag(is_australian_capital, True) - print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital)) - print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital)) - print('SYDNEY', nlp.vocab[u'SYDNEY'].check_flag(is_australian_capital)) - # Output - # Sydney True - # sydney False - # Sydney True - # sydney True - # SYDNEY True - # - # The key thing to note here is that we're setting these attributes once, - # over the vocabulary --- and then reusing them at run-time. This means the - # amortized complexity of anything we do this way is going to be O(1). You - # can match over expressions that need to have sets with tens of thousands - # of values, e.g. "all the street names in Germany", and you'll still have - # O(1) complexity. Most regular expression algorithms don't scale well to - # this sort of problem. - # - # Now, let's use this in a pattern - nlp.matcher.add("AuCitySportsTeam", "ORG", {}, - [ - [ - {LOWER: "the"}, - {is_australian_capital: True}, - {TAG: "NNS"} - ], - [ - {LOWER: "the"}, - {is_australian_capital: True}, - {TAG: "NNPS"} - ], - [ - {LOWER: "the"}, - {IS_ALPHA: True}, # Allow a word in between, e.g. The Western Sydney - {is_australian_capital: True}, - {TAG: "NNS"} - ], - [ - {LOWER: "the"}, - {IS_ALPHA: True}, # Allow a word in between, e.g. 
The Western Sydney - {is_australian_capital: True}, - {TAG: "NNPS"} - ] - ]) - doc = nlp(u'The pattern should match the Brisbane Broncos and the South Darwin Spiders, but not the Colorado Boulders') - for ent in doc.ents: - print(ent.text, ent.label_) - # Output - # the Brisbane Broncos ORG - # the South Darwin Spiders ORG - - -# Output -# Before -# Google ORG [u'NNP'] -# google ORG [u'VB'] -# google ORG [u'NNP'] -# After -# Google Now PRODUCT [u'NNP', u'RB'] -# google ORG [u'VB'] -# google now PRODUCT [u'NNP', u'RB'] -# Sydney True -# sydney False -# Sydney True -# sydney True -# SYDNEY True -# the Brisbane Broncos ORG -# the South Darwin Spiders ORG - -if __name__ == '__main__': - main() - diff --git a/examples/twitter_filter.py b/examples/twitter_filter.py deleted file mode 100644 index b6e4e4e83..000000000 --- a/examples/twitter_filter.py +++ /dev/null @@ -1,36 +0,0 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function -import plac -import codecs -import pathlib -import random - -import twython -import spacy.en - -import _handler - - -class Connection(twython.TwythonStreamer): - def __init__(self, keys_dir, nlp, query): - keys_dir = pathlib.Path(keys_dir) - read = lambda fn: (keys_dir / (fn + '.txt')).open().read().strip() - api_key = map(read, ['key', 'secret', 'token', 'token_secret']) - twython.TwythonStreamer.__init__(self, *api_key) - self.nlp = nlp - self.query = query - - def on_success(self, data): - _handler.handle_tweet(self.nlp, data, self.query) - if random.random() >= 0.1: - reload(_handler) - - -def main(keys_dir, term): - nlp = spacy.en.English() - twitter = Connection(keys_dir, nlp, term) - twitter.statuses.filter(track=term, language='en') - - -if __name__ == '__main__': - plac.call(main) From cc2917c9e8b5f519f3f023e2c8180153897c9f5d Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 18:47:02 +0200 Subject: [PATCH 72/99] Update fastText example and add to examples in docs --- examples/vectors_fast_text.py | 5 +++-- website/usage/_data.json | 1 + website/usage/examples.jade | 12 ++++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py index 323d5803f..159250098 100644 --- a/examples/vectors_fast_text.py +++ b/examples/vectors_fast_text.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # coding: utf8 -"""Load vectors for a language trained using FastText +"""Load vectors for a language trained using fastText https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md """ from __future__ import unicode_literals @@ -10,7 +10,8 @@ import numpy import from spacy.language import Language -@plac.annotations(vectors_loc=("Path to vectors", "positional", None, str)) +@plac.annotations( + vectors_loc=("Path to vectors", "positional", None, str)) def main(vectors_loc): nlp = Language() diff --git a/website/usage/_data.json b/website/usage/_data.json index c34b5f2b0..63e959882 100644 --- a/website/usage/_data.json +++ b/website/usage/_data.json @@ -199,6 +199,7 @@ "Information Extraction": "information-extraction", "Pipeline": "pipeline", "Training": "training", + "Vectors & Similarity": "vectors", "Deep Learning": "deep-learning" } } diff --git a/website/usage/examples.jade b/website/usage/examples.jade index 74d562e27..808810364 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -119,6 +119,18 @@ include ../_includes/_mixins +github("spacy", "examples/training/train_textcat.py") ++section("vectors") + +h(3, "fasttext") Loading 
pre-trained FastText vectors + + p + | This simple snippet is all you need to be able to use the Facebook's + | #[+a("https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md") fastText vectors] + | (294 languages, pre-trained on Wikipedia) with spaCy. Once they're + | loaded, the vectors will be available via spaCy's built-in + | #[code similarity()] methods. + + +github("spacy", "examples/vectors_fast_text.py") + +section("deep-learning") +h(3, "keras") Text classification with Keras From b7b285971fb2e0f058e83ebebc4834cb670c4a7c Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 18:47:11 +0200 Subject: [PATCH 73/99] Update examples README --- examples/README.md | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/examples/README.md b/examples/README.md index d7168f613..18a1760ec 100644 --- a/examples/README.md +++ b/examples/README.md @@ -2,20 +2,18 @@ # spaCy examples -The examples are Python scripts with well-behaved command line interfaces. For a full list of spaCy tutorials and code snippets, see the [documentation](https://spacy.io/docs/usage/tutorials). +The examples are Python scripts with well-behaved command line interfaces. For +more detailed usage guides, see the [documentation](https://alpha.spacy.io/usage/). -## How to run an example - -For example, to run the [`nn_text_class.py`](nn_text_class.py) script, do: +To see the available arguments, you can use the `--help` or `-h` flag: ```bash -$ python examples/nn_text_class.py -usage: nn_text_class.py [-h] [-d 3] [-H 300] [-i 5] [-w 40000] [-b 24] - [-r 0.3] [-p 1e-05] [-e 0.005] - data_dir -nn_text_class.py: error: too few arguments +$ python examples/training/train_ner.py --help ``` -You can print detailed help with the `-h` argument. - -While we try to keep the examples up to date, they are not currently exercised by the test suite, as some of them require significant data downloads or take time to train. If you find that an example is no longer running, [please tell us](https://github.com/explosion/spaCy/issues)! We know there's nothing worse than trying to figure out what you're doing wrong, and it turns out your code was never the problem. +While we try to keep the examples up to date, they are not currently exercised +by the test suite, as some of them require significant data downloads or take +time to train. If you find that an example is no longer running, +[please tell us](https://github.com/explosion/spaCy/issues)! We know there's +nothing worse than trying to figure out what you're doing wrong, and it turns +out your code was never the problem. 
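The command-line interfaces mentioned in this README are built with plac, following the same pattern the converted examples throughout this patch series use. The sketch below illustrates only that pattern; the script body and argument names are illustrative and not part of any example in the repository.

```python
#!/usr/bin/env python
# coding: utf8
"""Illustrative sketch of the plac-based CLI pattern the examples share."""
from __future__ import unicode_literals, print_function

import plac


@plac.annotations(
    # each annotation is (help text, kind, abbreviation, type)
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, n_iter=10):
    # a real example would load the model and run training here (placeholder)
    print("model=%s, n_iter=%d" % (model, n_iter))


if __name__ == '__main__':
    plac.call(main)  # exposes the annotated options plus a --help screen
```

Running such a script with `-h` prints the annotated help texts, which is why the `--help` flag works consistently across the examples.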
From f81cc0bd1c59776332254a8bb3e43f3b9d0781d7 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 00:31:30 +0200 Subject: [PATCH 74/99] Fix usage of disable_pipes --- examples/training/train_ner.py | 2 +- examples/training/train_new_entity_type.py | 2 +- examples/training/train_parser.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index 2e8241ffc..499807d23 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -52,7 +52,7 @@ def main(model=None, output_dir=None, n_iter=100): # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] - with nlp.disable_pipes(*other_pipes) as disabled: # only train NER + with nlp.disable_pipes(*other_pipes): # only train NER optimizer = nlp.begin_training(get_data) for itn in range(n_iter): random.shuffle(TRAIN_DATA) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 69ee20e04..ec1e562c6 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -86,7 +86,7 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=50): # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] - with nlp.disable_pipes(*other_pipes) as disabled: # only train NER + with nlp.disable_pipes(*other_pipes): # only train NER random.seed(0) optimizer = nlp.begin_training(lambda: []) for itn in range(n_iter): diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index 8cd602bcd..30a6f6095 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -66,7 +66,7 @@ def main(model=None, output_dir=None, n_iter=1000): # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser'] - with nlp.disable_pipes(*other_pipes) as disabled: # only train parser + with nlp.disable_pipes(*other_pipes): # only train parser optimizer = nlp.begin_training(lambda: []) for itn in range(n_iter): random.shuffle(TRAIN_DATA) From 4eb5bd02e7640465419ad1a16576d59dab2d11c0 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 00:32:12 +0200 Subject: [PATCH 75/99] Update textcat pre-processing after to_array change --- spacy/_ml.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 8a8d355d9..4c4e36412 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -94,7 +94,6 @@ def _zero_init(model): @layerize def _preprocess_doc(docs, drop=0.): keys = [doc.to_array([LOWER]) for doc in docs] - keys = [a[:, 0] for a in keys] ops = Model.ops lengths = ops.asarray([arr.shape[0] for arr in keys]) keys = ops.xp.concatenate(keys) @@ -521,7 +520,6 @@ def zero_init(model): @layerize def preprocess_doc(docs, drop=0.): keys = [doc.to_array([LOWER]) for doc in docs] - keys = [a[:, 0] for a in keys] ops = Model.ops lengths = ops.asarray([arr.shape[0] for arr in keys]) keys = ops.xp.concatenate(keys) From b61866a2e4d22842399531bf885dd6b0074b5eaa Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 00:32:19 +0200 Subject: [PATCH 76/99] Update textcat example --- examples/training/train_textcat.py | 188 ++++++++++++++++------------- 1 file changed, 102 insertions(+), 86 deletions(-) diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 4d07ed26a..2f540b530 100644 --- 
a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -1,58 +1,119 @@ -'''Train a multi-label convolutional neural network text classifier, -using the spacy.pipeline.TextCategorizer component. The model is then added -to spacy.pipeline, and predictions are available at `doc.cats`. -''' -from __future__ import unicode_literals +#!/usr/bin/env python +# coding: utf8 +"""Train a multi-label convolutional neural network text classifier on the +IMDB dataset, using the TextCategorizer component. The dataset will be loaded +automatically via Thinc's built-in dataset loader. The model is then added to +spacy.pipeline, and predictions are available via `doc.cats`. + +For more details, see the documentation: +* Training: https://alpha.spacy.io/usage/training +* Text classification: https://alpha.spacy.io/usage/text-classification + +Developed for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import unicode_literals, print_function import plac import random -import tqdm - -from thinc.neural.optimizers import Adam -from thinc.neural.ops import NumpyOps +from pathlib import Path import thinc.extra.datasets -import spacy.lang.en +import spacy from spacy.gold import GoldParse, minibatch from spacy.util import compounding from spacy.pipeline import TextCategorizer -# TODO: Remove this once we're not supporting models trained with thinc <6.9.0 -import thinc.neural._classes.layernorm -thinc.neural._classes.layernorm.set_compat_six_eight(False) +@plac.annotations( + model=("Model name. Defaults to blank 'en' model.", "option", "m", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int)) +def main(model=None, output_dir=None, n_iter=20): + if model is not None: + nlp = spacy.load(model) # load existing spaCy model + print("Loaded model '%s'" % model) + else: + nlp = spacy.blank('en') # create blank Language class + print("Created blank 'en' model") -def train_textcat(tokenizer, textcat, - train_texts, train_cats, dev_texts, dev_cats, - n_iter=20): - ''' - Train the TextCategorizer without associated pipeline. 
- ''' - textcat.begin_training() - optimizer = Adam(NumpyOps(), 0.001) - train_docs = [tokenizer(text) for text in train_texts] + # add the text classifier to the pipeline if it doesn't exist + # nlp.create_pipe works for built-ins that are registered with spaCy + if 'textcat' not in nlp.pipe_names: + # textcat = nlp.create_pipe('textcat') + textcat = TextCategorizer(nlp.vocab, labels=['POSITIVE']) + nlp.add_pipe(textcat, first=True) + # otherwise, get it, so we can add labels to it + else: + textcat = nlp.get_pipe('textcat') + + # add label to text classifier + # textcat.add_label('POSITIVE') + + # load the IMBD dataset + print("Loading IMDB data...") + (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000) + train_docs = [nlp.tokenizer(text) for text in train_texts] train_gold = [GoldParse(doc, cats=cats) for doc, cats in zip(train_docs, train_cats)] train_data = list(zip(train_docs, train_gold)) - batch_sizes = compounding(4., 128., 1.001) - for i in range(n_iter): - losses = {} - # Progress bar and minibatching - batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes) - for batch in batches: - docs, golds = zip(*batch) - textcat.update(docs, golds, sgd=optimizer, drop=0.2, - losses=losses) - with textcat.model.use_params(optimizer.averages): - scores = evaluate(tokenizer, textcat, dev_texts, dev_cats) - yield losses['textcat'], scores + + # get names of other pipes to disable them during training + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat'] + with nlp.disable_pipes(*other_pipes): # only train textcat + optimizer = nlp.begin_training(lambda: []) + print("Training the model...") + print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F')) + for i in range(n_iter): + losses = {} + # batch up the examples using spaCy's minibatch + batches = minibatch(train_data, size=compounding(4., 128., 1.001)) + for batch in batches: + docs, golds = zip(*batch) + nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses) + with textcat.model.use_params(optimizer.averages): + # evaluate on the dev data split off in load_data() + scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) + print('{0:.3f}\t{0:.3f}\t{0:.3f}\t{0:.3f}' # print a simple table + .format(losses['textcat'], scores['textcat_p'], + scores['textcat_r'], scores['textcat_f'])) + + # test the trained model + test_text = "This movie sucked" + doc = nlp(test_text) + print(test_text, doc.cats) + + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + print("Saved model to", output_dir) + + # test the saved model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + doc2 = nlp2(test_text) + print(test_text, doc2.cats) + + +def load_data(limit=0, split=0.8): + """Load data from the IMDB dataset.""" + # Partition off part of the train data for evaluation + train_data, _ = thinc.extra.datasets.imdb() + random.shuffle(train_data) + train_data = train_data[-limit:] + texts, labels = zip(*train_data) + cats = [{'POSITIVE': bool(y)} for y in labels] + split = int(len(train_data) * split) + return (texts[:split], cats[:split]), (texts[split:], cats[split:]) def evaluate(tokenizer, textcat, texts, cats): docs = (tokenizer(text) for text in texts) - tp = 1e-8 # True positives - fp = 1e-8 # False positives - fn = 1e-8 # False negatives - tn = 1e-8 # True negatives + tp = 1e-8 # True positives + fp = 1e-8 # False positives + fn = 1e-8 # False negatives + tn = 1e-8 # True 
negatives for i, doc in enumerate(textcat.pipe(docs)): gold = cats[i] for label, score in doc.cats.items(): @@ -66,55 +127,10 @@ def evaluate(tokenizer, textcat, texts, cats): tn += 1 elif score < 0.5 and gold[label] >= 0.5: fn += 1 - precis = tp / (tp + fp) + precision = tp / (tp + fp) recall = tp / (tp + fn) - fscore = 2 * (precis * recall) / (precis + recall) - return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore} - - -def load_data(limit=0): - # Partition off part of the train data --- avoid running experiments - # against test. - train_data, _ = thinc.extra.datasets.imdb() - - random.shuffle(train_data) - train_data = train_data[-limit:] - - texts, labels = zip(*train_data) - cats = [{'POSITIVE': bool(y)} for y in labels] - - split = int(len(train_data) * 0.8) - - train_texts = texts[:split] - train_cats = cats[:split] - dev_texts = texts[split:] - dev_cats = cats[split:] - return (train_texts, train_cats), (dev_texts, dev_cats) - - -def main(model_loc=None): - nlp = spacy.lang.en.English() - tokenizer = nlp.tokenizer - textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE']) - - print("Load IMDB data") - (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000) - - print("Itn.\tLoss\tP\tR\tF") - progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}' - - for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat, - train_texts, train_cats, - dev_texts, dev_cats, n_iter=20)): - print(progress.format(i=i, loss=loss, **scores)) - # How to save, load and use - nlp.pipeline.append(textcat) - if model_loc is not None: - nlp.to_disk(model_loc) - - nlp = spacy.load(model_loc) - doc = nlp(u'This movie sucked!') - print(doc.cats) + f_score = 2 * (precision * recall) / (precision + recall) + return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score} if __name__ == '__main__': From a7b9074b4c06920d86e610647abbb550cf2f16c3 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 00:48:45 +0200 Subject: [PATCH 77/99] Update textcat training example and docs --- examples/training/train_textcat.py | 4 +- website/usage/_training/_textcat.jade | 62 +++++++++++++++++++++++--- website/usage/examples.jade | 9 ++-- website/usage/text-classification.jade | 6 +-- 4 files changed, 65 insertions(+), 16 deletions(-) diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 2f540b530..1f9cd29aa 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -2,7 +2,7 @@ # coding: utf8 """Train a multi-label convolutional neural network text classifier on the IMDB dataset, using the TextCategorizer component. The dataset will be loaded -automatically via Thinc's built-in dataset loader. The model is then added to +automatically via Thinc's built-in dataset loader. The model is added to spacy.pipeline, and predictions are available via `doc.cats`. 
For more details, see the documentation: @@ -41,7 +41,7 @@ def main(model=None, output_dir=None, n_iter=20): if 'textcat' not in nlp.pipe_names: # textcat = nlp.create_pipe('textcat') textcat = TextCategorizer(nlp.vocab, labels=['POSITIVE']) - nlp.add_pipe(textcat, first=True) + nlp.add_pipe(textcat, last=True) # otherwise, get it, so we can add labels to it else: textcat = nlp.get_pipe('textcat') diff --git a/website/usage/_training/_textcat.jade b/website/usage/_training/_textcat.jade index 5c90519db..ad863bce1 100644 --- a/website/usage/_training/_textcat.jade +++ b/website/usage/_training/_textcat.jade @@ -1,13 +1,63 @@ //- 💫 DOCS > USAGE > TRAINING > TEXT CLASSIFICATION -+under-construction - -+h(3, "example-textcat") Example: Training spaCy's text classifier ++h(3, "example-textcat") Adding a text classifier to a spaCy model +tag-new(2) p - | This example shows how to use and train spaCy's new - | #[+api("textcategorizer") #[code TextCategorizer]] pipeline component - | on IMDB movie reviews. + | This example shows how to train a multi-label convolutional neural + | network text classifier on IMDB movie reviews, using spaCy's new + | #[+api("textcategorizer") #[code TextCategorizer]] component. The + | dataset will be loaded automatically via Thinc's built-in dataset + | loader. Predictions are available via + | #[+api("doc#attributes") #[code Doc.cats]]. +github("spacy", "examples/training/train_textcat.py") + ++h(4) Step by step guide + ++list("numbers") + +item + | #[strong Load the model] you want to start with, or create an + | #[strong empty model] using + | #[+api("spacy#blank") #[code spacy.blank]] with the ID of your + | language. If you're using a blank model, don't forget to add the + | parser to the pipeline. If you're using an existing model, + | make sure to disable all other pipeline components during training + | using #[+api("language#disable_pipes") #[code nlp.disable_pipes]]. + | This way, you'll only be training the parser. + + +item + | #[strong Add the text classifier] to the pipeline, and add the labels + | you want to train – for example, #[code POSITIVE]. + + +item + | #[strong Load and pre-process the dataset], shuffle the data and + | split off a part of it to hold back for evaluation. This way, you'll + | be able to see results on each training iteration. + + +item + | #[strong Loop over] the training examples, partition them into + | batches and create #[code Doc] and #[code GoldParse] objects for each + | example in the batch. + + +item + | #[strong Update the model] by calling + | #[+api("language#update") #[code nlp.update]], which steps + | through the examples and makes a #[strong prediction]. It then + | consults the annotations provided on the #[code GoldParse] instance, + | to see whether it was right. If it was wrong, it adjusts its weights + | so that the correct prediction will score higher next time. + + +item + | Optionally, you can also #[strong evaluate the text classifier] on + | each iteration, by checking how it performs on the development data + | held back from the dataset. This lets you print the + | #[strong precision], #[strong recall] and #[strong F-score]. + + +item + | #[strong Save] the trained model using + | #[+api("language#to_disk") #[code nlp.to_disk]]. + + +item + | #[strong Test] the model to make sure the text classifier works as + | expected. 
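Condensed into code, the steps above look roughly like the sketch below. It reuses the same API calls as the full `train_textcat.py` example added in this patch, but with a two-example toy dataset and a fixed iteration count, both purely illustrative.

```python
#!/usr/bin/env python
# coding: utf8
"""Condensed sketch of the text classifier training loop (toy data only)."""
from __future__ import unicode_literals, print_function

import random
import spacy
from spacy.gold import GoldParse, minibatch
from spacy.util import compounding
from spacy.pipeline import TextCategorizer

TRAIN_DATA = [('This movie was great', {'POSITIVE': True}),   # toy examples
              ('This movie sucked', {'POSITIVE': False})]

nlp = spacy.blank('en')                                     # blank model
textcat = TextCategorizer(nlp.vocab, labels=['POSITIVE'])   # add text classifier
nlp.add_pipe(textcat, last=True)

docs = [nlp.tokenizer(text) for text, _ in TRAIN_DATA]      # pre-process data
golds = [GoldParse(doc, cats=cats) for doc, (_, cats) in zip(docs, TRAIN_DATA)]
train_data = list(zip(docs, golds))

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):                       # only train textcat
    optimizer = nlp.begin_training(lambda: [])
    for i in range(10):                                     # illustrative count
        random.shuffle(train_data)
        losses = {}
        for batch in minibatch(train_data, size=compounding(4., 32., 1.001)):
            batch_docs, batch_golds = zip(*batch)
            nlp.update(batch_docs, batch_golds, sgd=optimizer, drop=0.2,
                       losses=losses)
        print(i, losses)

print(nlp(u'This movie sucked').cats)                       # test the model
```

With real data, the IMDB loader and the evaluation function from the full example replace the toy dataset and the final print statement.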
diff --git a/website/usage/examples.jade b/website/usage/examples.jade index 808810364..525d584a1 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -113,9 +113,12 @@ include ../_includes/_mixins +tag-new(2) p - | This example shows how to use and train spaCy's new - | #[+api("textcategorizer") #[code TextCategorizer]] pipeline component - | on IMDB movie reviews. + | This example shows how to train a multi-label convolutional neural + | network text classifier on IMDB movie reviews, using spaCy's new + | #[+api("textcategorizer") #[code TextCategorizer]] component. The + | dataset will be loaded automatically via Thinc's built-in dataset + | loader. Predictions are available via + | #[+api("doc#attributes") #[code Doc.cats]]. +github("spacy", "examples/training/train_textcat.py") diff --git a/website/usage/text-classification.jade b/website/usage/text-classification.jade index 8a0e93450..9e43d185c 100644 --- a/website/usage/text-classification.jade +++ b/website/usage/text-classification.jade @@ -2,8 +2,4 @@ include ../_includes/_mixins -+under-construction - -+h(2, "example") Example - -+github("spacy", "examples/training/train_textcat.py") +include _training/_textcat From 647ef64f8696d667481c149cefba269b2dae9755 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 00:51:29 +0200 Subject: [PATCH 78/99] Update textcat docs --- website/usage/_training/_textcat.jade | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/website/usage/_training/_textcat.jade b/website/usage/_training/_textcat.jade index ad863bce1..5ccff7a84 100644 --- a/website/usage/_training/_textcat.jade +++ b/website/usage/_training/_textcat.jade @@ -20,11 +20,10 @@ p | #[strong Load the model] you want to start with, or create an | #[strong empty model] using | #[+api("spacy#blank") #[code spacy.blank]] with the ID of your - | language. If you're using a blank model, don't forget to add the - | parser to the pipeline. If you're using an existing model, - | make sure to disable all other pipeline components during training - | using #[+api("language#disable_pipes") #[code nlp.disable_pipes]]. - | This way, you'll only be training the parser. + | language. If you're using an existing model, make sure to disable all + | other pipeline components during training using + | #[+api("language#disable_pipes") #[code nlp.disable_pipes]]. This + | way, you'll only be training the text classifier. +item | #[strong Add the text classifier] to the pipeline, and add the labels From 096a80170d23365e1b8ff9d3749bb6caa379abdd Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 01:48:39 +0200 Subject: [PATCH 79/99] Remove old example files --- examples/_handler.py | 37 ------------------- examples/parallel_parse.py | 74 -------------------------------------- 2 files changed, 111 deletions(-) delete mode 100644 examples/_handler.py delete mode 100644 examples/parallel_parse.py diff --git a/examples/_handler.py b/examples/_handler.py deleted file mode 100644 index cebfe8968..000000000 --- a/examples/_handler.py +++ /dev/null @@ -1,37 +0,0 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function - -from math import sqrt -from numpy import dot -from numpy.linalg import norm - - -def handle_tweet(spacy, tweet_data, query): - text = tweet_data.get('text', u'') - # Twython returns either bytes or unicode, depending on tweet. 
- # ಠ_ಠ #APIshaming - try: - match_tweet(spacy, text, query) - except TypeError: - match_tweet(spacy, text.decode('utf8'), query) - - -def match_tweet(spacy, text, query): - def get_vector(word): - return spacy.vocab[word].repvec - - tweet = spacy(text) - tweet = [w.repvec for w in tweet if w.is_alpha and w.lower_ != query] - if tweet: - accept = map(get_vector, 'child classroom teach'.split()) - reject = map(get_vector, 'mouth hands giveaway'.split()) - - y = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in accept) - n = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in reject) - - if (y / (y + n)) >= 0.5 or True: - print(text) - - -def cos(v1, v2): - return dot(v1, v2) / (norm(v1) * norm(v2)) diff --git a/examples/parallel_parse.py b/examples/parallel_parse.py deleted file mode 100644 index 5cdd0778b..000000000 --- a/examples/parallel_parse.py +++ /dev/null @@ -1,74 +0,0 @@ -from __future__ import print_function, unicode_literals, division -import io -import bz2 -import logging -from toolz import partition -from os import path -import re - -import spacy.en -from spacy.tokens import Doc - -from joblib import Parallel, delayed -import plac -import ujson - - -def parallelize(func, iterator, n_jobs, extra, backend='multiprocessing'): - extra = tuple(extra) - return Parallel(n_jobs=n_jobs, backend=backend)(delayed(func)(*(item + extra)) - for item in iterator) - - -def iter_comments(loc): - with bz2.BZ2File(loc) as file_: - for i, line in enumerate(file_): - yield ujson.loads(line)['body'] - - -pre_format_re = re.compile(r'^[\`\*\~]') -post_format_re = re.compile(r'[\`\*\~]$') -url_re = re.compile(r'\[([^]]+)\]\(%%URL\)') -link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)') -def strip_meta(text): - text = link_re.sub(r'\1', text) - text = text.replace('>', '>').replace('<', '<') - text = pre_format_re.sub('', text) - text = post_format_re.sub('', text) - return text.strip() - - -def save_parses(batch_id, input_, out_dir, n_threads, batch_size): - out_loc = path.join(out_dir, '%d.bin' % batch_id) - if path.exists(out_loc): - return None - print('Batch', batch_id) - nlp = spacy.en.English() - nlp.matcher = None - with open(out_loc, 'wb') as file_: - texts = (strip_meta(text) for text in input_) - texts = (text for text in texts if text.strip()) - for doc in nlp.pipe(texts, batch_size=batch_size, n_threads=n_threads): - file_.write(doc.to_bytes()) - -@plac.annotations( - in_loc=("Location of input file"), - out_dir=("Location of input file"), - n_process=("Number of processes", "option", "p", int), - n_thread=("Number of threads per process", "option", "t", int), - batch_size=("Number of texts to accumulate in a buffer", "option", "b", int) -) -def main(in_loc, out_dir, n_process=1, n_thread=4, batch_size=100): - if not path.exists(out_dir): - path.join(out_dir) - if n_process >= 2: - texts = partition(200000, iter_comments(in_loc)) - parallelize(save_parses, enumerate(texts), n_process, [out_dir, n_thread, batch_size], - backend='multiprocessing') - else: - save_parses(0, iter_comments(in_loc), out_dir, n_thread, batch_size) - - - -if __name__ == '__main__': - plac.call(main) From ed69bd69f4cb7dcc8ba9f70cdc2e4de197520869 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 01:48:52 +0200 Subject: [PATCH 80/99] Update parallel tagging example --- examples/parallel_tag.py | 71 +++++++++++++++++++++++++++++++ examples/pos_tag.py | 90 ---------------------------------------- 2 files changed, 71 insertions(+), 90 deletions(-) create mode 100644 examples/parallel_tag.py delete mode 100644 
examples/pos_tag.py diff --git a/examples/parallel_tag.py b/examples/parallel_tag.py new file mode 100644 index 000000000..a6571a2ac --- /dev/null +++ b/examples/parallel_tag.py @@ -0,0 +1,71 @@ +""" +Print part-of-speech tagged, true-cased, (very roughly) sentence-separated +text, with each "sentence" on a newline, and spaces between tokens. Supports +multi-processing. + +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import print_function, unicode_literals, division +from toolz import partition_all +from pathlib import Path +from joblib import Parallel, delayed +import thinc.extra.datasets +import plac +import spacy + + +@plac.annotations( + output_dir=("Output directory", "positional", None, Path), + model=("Model name (needs tagger)", "positional", None, str), + n_jobs=("Number of workers", "option", "n", int), + batch_size=("Batch-size for each process", "option", "b", int), + limit=("Limit of entries from the dataset", "option", "l", int)) +def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000, + limit=10000): + nlp = spacy.load(model) # load spaCy model + print("Loaded model '%s'" % model) + if not output_dir.exists(): + output_dir.mkdir() + # load and pre-process the IMBD dataset + print("Loading IMDB data...") + data, _ = thinc.extra.datasets.imdb() + texts, _ = zip(*data[-limit:]) + partitions = partition_all(batch_size, texts) + items = ((i, [nlp(text) for text in texts], output_dir) for i, texts + in enumerate(partitions)) + Parallel(n_jobs=n_jobs)(delayed(transform_texts)(*item) for item in items) + + +def transform_texts(batch_id, docs, output_dir): + out_path = Path(output_dir) / ('%d.txt' % batch_id) + if out_path.exists(): # return None in case same batch is called again + return None + print('Processing batch', batch_id) + with out_path.open('w', encoding='utf8') as f: + for doc in docs: + f.write(' '.join(represent_word(w) for w in doc if not w.is_space)) + f.write('\n') + print('Saved {} texts to {}.txt'.format(len(docs), batch_id)) + + +def represent_word(word): + text = word.text + # True-case, i.e. try to normalize sentence-initial capitals. + # Only do this if the lower-cased form is more probable. + if text.istitle() and is_sent_begin(word) \ + and word.prob < word.doc.vocab[text.lower()].prob: + text = text.lower() + return text + '|' + word.tag_ + + +def is_sent_begin(word): + if word.i == 0: + return True + elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'): + return True + else: + return False + + +if __name__ == '__main__': + plac.call(main) diff --git a/examples/pos_tag.py b/examples/pos_tag.py deleted file mode 100644 index 1dd6add0f..000000000 --- a/examples/pos_tag.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -Print part-of-speech tagged, true-cased, (very roughly) sentence-separated -text, with each "sentence" on a newline, and spaces between tokens. Supports -multi-processing. -""" -from __future__ import print_function, unicode_literals, division -import io -import bz2 -import logging -from toolz import partition -from os import path - -import spacy.en - -from joblib import Parallel, delayed -import plac -import ujson - - -def parallelize(func, iterator, n_jobs, extra): - extra = tuple(extra) - return Parallel(n_jobs=n_jobs)(delayed(func)(*(item + extra)) for item in iterator) - - -def iter_texts_from_json_bz2(loc): - """ - Iterator of unicode strings, one per document (here, a comment). - - Expects a a path to a BZ2 file, which should be new-line delimited JSON. 
The - document text should be in a string field titled 'body'. - - This is the data format of the Reddit comments corpus. - """ - with bz2.BZ2File(loc) as file_: - for i, line in enumerate(file_): - yield ujson.loads(line)['body'] - - -def transform_texts(batch_id, input_, out_dir): - out_loc = path.join(out_dir, '%d.txt' % batch_id) - if path.exists(out_loc): - return None - print('Batch', batch_id) - nlp = spacy.en.English(parser=False, entity=False) - with io.open(out_loc, 'w', encoding='utf8') as file_: - for text in input_: - doc = nlp(text) - file_.write(' '.join(represent_word(w) for w in doc if not w.is_space)) - file_.write('\n') - - -def represent_word(word): - text = word.text - # True-case, i.e. try to normalize sentence-initial capitals. - # Only do this if the lower-cased form is more probable. - if text.istitle() \ - and is_sent_begin(word) \ - and word.prob < word.doc.vocab[text.lower()].prob: - text = text.lower() - return text + '|' + word.tag_ - - -def is_sent_begin(word): - # It'd be nice to have some heuristics like these in the library, for these - # times where we don't care so much about accuracy of SBD, and we don't want - # to parse - if word.i == 0: - return True - elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'): - return True - else: - return False - - -@plac.annotations( - in_loc=("Location of input file"), - out_dir=("Location of input file"), - n_workers=("Number of workers", "option", "n", int), - batch_size=("Batch-size for each process", "option", "b", int) -) -def main(in_loc, out_dir, n_workers=4, batch_size=100000): - if not path.exists(out_dir): - path.join(out_dir) - texts = partition(batch_size, iter_texts_from_json_bz2(in_loc)) - parallelize(transform_texts, enumerate(texts), n_workers, [out_dir]) - - -if __name__ == '__main__': - plac.call(main) - From 4eabaafd667c97c2f5e9bbd65cf2fd775b0fbef8 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 01:50:44 +0200 Subject: [PATCH 81/99] Update docstring and example --- examples/parallel_tag.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/parallel_tag.py b/examples/parallel_tag.py index a6571a2ac..445b9fb69 100644 --- a/examples/parallel_tag.py +++ b/examples/parallel_tag.py @@ -1,11 +1,11 @@ """ -Print part-of-speech tagged, true-cased, (very roughly) sentence-separated -text, with each "sentence" on a newline, and spaces between tokens. Supports -multi-processing. +Example of multi-processing with joblib. Here, we're exporting +part-of-speech-tagged, true-cased, (very roughly) sentence-separated text, with +each "sentence" on a newline, and spaces between tokens. 
Last updated for: spaCy 2.0.0a18 """ -from __future__ import print_function, unicode_literals, division +from __future__ import print_function, unicode_literals from toolz import partition_all from pathlib import Path from joblib import Parallel, delayed From 1d69a46cd4afa6cdc4d79e39cacf26c97d7c1c8a Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 01:58:55 +0200 Subject: [PATCH 82/99] Update multi-processing example and add to docs --- examples/parallel_tag.py | 6 ++++-- website/usage/_data.json | 2 +- .../_processing-pipelines/_multithreading.jade | 13 +++++++++++++ website/usage/examples.jade | 13 +++++++++++++ 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/examples/parallel_tag.py b/examples/parallel_tag.py index 445b9fb69..19b1c462a 100644 --- a/examples/parallel_tag.py +++ b/examples/parallel_tag.py @@ -1,7 +1,9 @@ """ -Example of multi-processing with joblib. Here, we're exporting +Example of multi-processing with Joblib. Here, we're exporting part-of-speech-tagged, true-cased, (very roughly) sentence-separated text, with -each "sentence" on a newline, and spaces between tokens. +each "sentence" on a newline, and spaces between tokens. Data is loaded from +the IMDB movie reviews dataset and will be loaded automatically via Thinc's +built-in dataset loader. Last updated for: spaCy 2.0.0a18 """ diff --git a/website/usage/_data.json b/website/usage/_data.json index 63e959882..4a4e6df01 100644 --- a/website/usage/_data.json +++ b/website/usage/_data.json @@ -106,7 +106,7 @@ "How Pipelines Work": "pipelines", "Custom Components": "custom-components", "Developing Extensions": "extensions", - "Multi-threading": "multithreading", + "Multi-Threading": "multithreading", "Serialization": "serialization" } }, diff --git a/website/usage/_processing-pipelines/_multithreading.jade b/website/usage/_processing-pipelines/_multithreading.jade index 1e08508b8..206879e28 100644 --- a/website/usage/_processing-pipelines/_multithreading.jade +++ b/website/usage/_processing-pipelines/_multithreading.jade @@ -38,3 +38,16 @@ p | the generator in two, and then #[code izip] the extra stream to the | document stream. Here's | #[+a(gh("spacy") + "/issues/172#issuecomment-183963403") an example]. + ++h(3, "multi-processing-example") Example: Multi-processing with Joblib + +p + | This example shows how to use multiple cores to process text using + | spaCy and #[+a("https://pythonhosted.org/joblib/") Joblib]. We're + | exporting part-of-speech-tagged, true-cased, (very roughly) + | sentence-separated text, with each "sentence" on a newline, and + | spaces between tokens. Data is loaded from the IMDB movie reviews + | dataset and will be loaded automatically via Thinc's built-in dataset + | loader. + ++github("spacy", "examples/parallel_tag.py") diff --git a/website/usage/examples.jade b/website/usage/examples.jade index 525d584a1..b00de183b 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -71,6 +71,19 @@ include ../_includes/_mixins +github("spacy", "examples/pipeline/custom_attr_methods.py") + +h(3, "parallel-tag") Multi-processing with Joblib + + p + | This example shows how to use multiple cores to process text using + | spaCy and #[+a("https://pythonhosted.org/joblib/") Joblib]. We're + | exporting part-of-speech-tagged, true-cased, (very roughly) + | sentence-separated text, with each "sentence" on a newline, and + | spaces between tokens. 
Data is loaded from the IMDB movie reviews + | dataset and will be loaded automatically via Thinc's built-in dataset + | loader. + + +github("spacy", "examples/parallel_tag.py") + +section("training") +h(3, "training-ner") Training spaCy's Named Entity Recognizer From af28ca1ba09136c5e01d4e7235c69b3b1609632b Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 02:00:01 +0200 Subject: [PATCH 83/99] Move example to pipeline directory --- examples/{parallel_tag.py => pipeline/multi_processing.py} | 0 website/usage/_processing-pipelines/_multithreading.jade | 2 +- website/usage/examples.jade | 4 ++-- 3 files changed, 3 insertions(+), 3 deletions(-) rename examples/{parallel_tag.py => pipeline/multi_processing.py} (100%) diff --git a/examples/parallel_tag.py b/examples/pipeline/multi_processing.py similarity index 100% rename from examples/parallel_tag.py rename to examples/pipeline/multi_processing.py diff --git a/website/usage/_processing-pipelines/_multithreading.jade b/website/usage/_processing-pipelines/_multithreading.jade index 206879e28..a80768f38 100644 --- a/website/usage/_processing-pipelines/_multithreading.jade +++ b/website/usage/_processing-pipelines/_multithreading.jade @@ -50,4 +50,4 @@ p | dataset and will be loaded automatically via Thinc's built-in dataset | loader. -+github("spacy", "examples/parallel_tag.py") ++github("spacy", "examples/pipeline/multi_processing.py") diff --git a/website/usage/examples.jade b/website/usage/examples.jade index b00de183b..a97471dbe 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -71,7 +71,7 @@ include ../_includes/_mixins +github("spacy", "examples/pipeline/custom_attr_methods.py") - +h(3, "parallel-tag") Multi-processing with Joblib + +h(3, "multi-processing") Multi-processing with Joblib p | This example shows how to use multiple cores to process text using @@ -82,7 +82,7 @@ include ../_includes/_mixins | dataset and will be loaded automatically via Thinc's built-in dataset | loader. - +github("spacy", "examples/parallel_tag.py") + +github("spacy", "examples/pipeline/multi_processing.py") +section("training") +h(3, "training-ner") Training spaCy's Named Entity Recognizer From 3ed71c46be73a03d38e7157d44ede4fd80634ded Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 27 Oct 2017 02:29:40 +0200 Subject: [PATCH 84/99] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 9cffd2cae..a503abbc0 100644 --- a/README.rst +++ b/README.rst @@ -42,7 +42,7 @@ integration. It's commercial open-source software, released under the MIT licens =================== === `spaCy 101`_ New to spaCy? Here's everything you need to know! `Usage Guides`_ How to use spaCy and its features. -`New in v2.0`_ New features, backwards incompatibilitiies and migration guide. +`New in v2.0`_ New features, backwards incompatibilities and migration guide. `API Reference`_ The detailed reference for spaCy's API. `Models`_ Download statistical language models for spaCy. `Resources`_ Libraries, extensions, demos, books and courses. 
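The multi-processing pattern added to the pipeline docs above comes down to partitioning the texts into batches and dispatching one batch per Joblib worker. The skeleton below shows only that mechanism; the worker function, batch size and placeholder texts are illustrative, whereas the full `multi_processing.py` example tags and true-cases the IMDB reviews and writes each batch to disk.

```python
# coding: utf8
"""Skeleton of batched multi-processing with Joblib (illustrative only)."""
from __future__ import unicode_literals, print_function

from toolz import partition_all
from joblib import Parallel, delayed
import spacy


def process_batch(batch_id, texts):
    # each worker builds its own lightweight pipeline, so nothing unpicklable
    # has to be shipped between processes
    nlp = spacy.blank('en')
    n_tokens = sum(len(nlp(text)) for text in texts)
    return batch_id, n_tokens


if __name__ == '__main__':
    texts = ['This is a text about movies.'] * 10000  # placeholder data
    partitions = partition_all(1000, texts)           # batches of 1,000 texts
    results = Parallel(n_jobs=4)(
        delayed(process_batch)(i, batch) for i, batch in enumerate(partitions))
    print(results)
```

Keeping the worker's return value small (here just a count per batch) avoids sending large objects back through the process boundary; writing results to disk inside the worker, as the full example does, achieves the same thing.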
From 44f83b35bc86b791d80ad52c4f44c82559be4507 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 02:58:14 +0200 Subject: [PATCH 85/99] Update pipeline component examples to use plac --- examples/pipeline/custom_attr_methods.py | 63 ++++++++++++------ .../custom_component_countries_api.py | 65 ++++++++++++------- .../pipeline/custom_component_entities.py | 60 ++++++++++++----- 3 files changed, 129 insertions(+), 59 deletions(-) diff --git a/examples/pipeline/custom_attr_methods.py b/examples/pipeline/custom_attr_methods.py index 9b1a8325d..741541b06 100644 --- a/examples/pipeline/custom_attr_methods.py +++ b/examples/pipeline/custom_attr_methods.py @@ -1,35 +1,60 @@ +#!/usr/bin/env python # coding: utf-8 """This example contains several snippets of methods that can be set via custom Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like they're "bound" to the object and are partially applied – i.e. the object -they're called on is passed in as the first argument.""" +they're called on is passed in as the first argument. + +* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components + +Developed for: spaCy 2.0.0a17 +Last updated for: spaCy 2.0.0a18 +""" from __future__ import unicode_literals +import plac from spacy.lang.en import English from spacy.tokens import Doc, Span from spacy import displacy from pathlib import Path +@plac.annotations( + output_dir=("Output directory for saved HTML", "positional", None, Path)) +def main(output_dir=None): + nlp = English() # start off with blank English class + + Doc.set_extension('overlap', method=overlap_tokens) + doc1 = nlp(u"Peach emoji is where it has always been.") + doc2 = nlp(u"Peach is the superior emoji.") + print("Text 1:", doc1.text) + print("Text 2:", doc2.text) + print("Overlapping tokens:", doc1._.overlap(doc2)) + + Doc.set_extension('to_html', method=to_html) + doc = nlp(u"This is a sentence about Apple.") + # add entity manually for demo purposes, to make it work without a model + doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])] + print("Text:", doc.text) + doc._.to_html(output=output_dir, style='ent') + + def to_html(doc, output='/tmp', style='dep'): """Doc method extension for saving the current state as a displaCy visualization. 
""" # generate filename from first six non-punct tokens file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html' - output_path = Path(output) / file_name html = displacy.render(doc, style=style, page=True) # render markup - output_path.open('w', encoding='utf-8').write(html) # save to file - print('Saved HTML to {}'.format(output_path)) - - -Doc.set_extension('to_html', method=to_html) - -nlp = English() -doc = nlp(u"This is a sentence about Apple.") -# add entity manually for demo purposes, to make it work without a model -doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])] -doc._.to_html(style='ent') + if output is not None: + output_path = Path(output) + if not output_path.exists(): + output_path.mkdir() + output_file = Path(output) / file_name + output_file.open('w', encoding='utf-8').write(html) # save to file + print('Saved HTML to {}'.format(output_file)) + else: + print(html) def overlap_tokens(doc, other_doc): @@ -43,10 +68,10 @@ def overlap_tokens(doc, other_doc): return overlap -Doc.set_extension('overlap', method=overlap_tokens) +if __name__ == '__main__': + plac.call(main) -nlp = English() -doc1 = nlp(u"Peach emoji is where it has always been.") -doc2 = nlp(u"Peach is the superior emoji.") -tokens = doc1._.overlap(doc2) -print(tokens) + # Expected output: + # Text 1: Peach emoji is where it has always been. + # Text 2: Peach is the superior emoji. + # Overlapping tokens: [Peach, emoji, is, .] diff --git a/examples/pipeline/custom_component_countries_api.py b/examples/pipeline/custom_component_countries_api.py index 2554af967..38eec7384 100644 --- a/examples/pipeline/custom_component_countries_api.py +++ b/examples/pipeline/custom_component_countries_api.py @@ -1,21 +1,45 @@ -# coding: utf-8 +#!/usr/bin/env python +# coding: utf8 +"""Example of a spaCy v2.0 pipeline component that requests all countries via +the REST Countries API, merges country names into one token, assigns entity +labels and sets attributes on country tokens, e.g. the capital and lat/lng +coordinates. Can be extended with more details from the API. + +* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0) +* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components + +Developed for: spaCy 2.0.0a17 +Last updated for: spaCy 2.0.0a18 +""" from __future__ import unicode_literals import requests - +import plac from spacy.lang.en import English from spacy.matcher import PhraseMatcher from spacy.tokens import Doc, Span, Token -class RESTCountriesComponent(object): - """Example of a spaCy v2.0 pipeline component that requests all countries - via the REST Countries API, merges country names into one token, assigns - entity labels and sets attributes on country tokens, e.g. the capital and - lat/lng coordinates. Can be extended with more details from the API. +def main(): + # For simplicity, we start off with only the blank English Language class + # and no model or pre-defined pipeline loaded. 
+ nlp = English() + rest_countries = RESTCountriesComponent(nlp) # initialise component + nlp.add_pipe(rest_countries) # add it to the pipeline + doc = nlp(u"Some text about Colombia and the Czech Republic") + print('Pipeline', nlp.pipe_names) # pipeline contains component name + print('Doc has countries', doc._.has_country) # Doc contains countries + for token in doc: + if token._.is_country: + print(token.text, token._.country_capital, token._.country_latlng, + token._.country_flag) # country data + print('Entities', [(e.text, e.label_) for e in doc.ents]) # entities - REST Countries API: https://restcountries.eu - API License: Mozilla Public License MPL 2.0 + +class RESTCountriesComponent(object): + """spaCy v2.0 pipeline component that requests all countries via + the REST Countries API, merges country names into one token, assigns entity + labels and sets attributes on country tokens. """ name = 'rest_countries' # component name, will show up in the pipeline @@ -90,19 +114,12 @@ class RESTCountriesComponent(object): return any([t._.get('is_country') for t in tokens]) -# For simplicity, we start off with only the blank English Language class and -# no model or pre-defined pipeline loaded. +if __name__ == '__main__': + plac.call(main) -nlp = English() -rest_countries = RESTCountriesComponent(nlp) # initialise component -nlp.add_pipe(rest_countries) # add it to the pipeline - -doc = nlp(u"Some text about Colombia and the Czech Republic") - -print('Pipeline', nlp.pipe_names) # pipeline contains component name -print('Doc has countries', doc._.has_country) # Doc contains countries -for token in doc: - if token._.is_country: - print(token.text, token._.country_capital, token._.country_latlng, - token._.country_flag) # country data -print('Entities', [(e.text, e.label_) for e in doc.ents]) # all countries are entities + # Expected output: + # Pipeline ['rest_countries'] + # Doc has countries True + # Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg + # Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg + # Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')] diff --git a/examples/pipeline/custom_component_entities.py b/examples/pipeline/custom_component_entities.py index a0d9c61ec..050a89905 100644 --- a/examples/pipeline/custom_component_entities.py +++ b/examples/pipeline/custom_component_entities.py @@ -1,11 +1,45 @@ -# coding: utf-8 +#!/usr/bin/env python +# coding: utf8 +"""Example of a spaCy v2.0 pipeline component that sets entity annotations +based on list of single or multiple-word company names. Companies are +labelled as ORG and their spans are merged into one token. Additionally, +._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token +respectively. + +* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components + +Developed for: spaCy 2.0.0a17 +Last updated for: spaCy 2.0.0a18 +""" from __future__ import unicode_literals +import plac from spacy.lang.en import English from spacy.matcher import PhraseMatcher from spacy.tokens import Doc, Span, Token +@plac.annotations( + text=("Text to process", "positional", None, str), + companies=("Names of technology companies", "positional", None, str)) +def main(text="Alphabet Inc. is the company behind Google.", *companies): + # For simplicity, we start off with only the blank English Language class + # and no model or pre-defined pipeline loaded. 
+ nlp = English() + if not companies: # set default companies if none are set via args + companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc. + component = TechCompanyRecognizer(nlp, companies) # initialise component + nlp.add_pipe(component, last=True) # add last to the pipeline + + doc = nlp(text) + print('Pipeline', nlp.pipe_names) # pipeline contains component name + print('Tokens', [t.text for t in doc]) # company names from the list are merged + print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs + print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org + print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not + print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities + + class TechCompanyRecognizer(object): """Example of a spaCy v2.0 pipeline component that sets entity annotations based on list of single or multiple-word company names. Companies are @@ -67,19 +101,13 @@ class TechCompanyRecognizer(object): return any([t._.get('is_tech_org') for t in tokens]) -# For simplicity, we start off with only the blank English Language class and -# no model or pre-defined pipeline loaded. +if __name__ == '__main__': + plac.call(main) -nlp = English() -companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc. -component = TechCompanyRecognizer(nlp, companies) # initialise component -nlp.add_pipe(component, last=True) # add it to the pipeline as the last element - -doc = nlp(u"Alphabet Inc. is the company behind Google.") - -print('Pipeline', nlp.pipe_names) # pipeline contains component name -print('Tokens', [t.text for t in doc]) # company names from the list are merged -print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs -print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." 
is a tech org -print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not -print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities + # Expected output: + # Pipeline ['tech_companies'] + # Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.'] + # Doc has_tech_org True + # Token 0 is_tech_org True + # Token 1 is_tech_org False + # Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')] From bb25bdcd923534108691174850449f98711c6834 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 01:16:55 +0000 Subject: [PATCH 86/99] Adjust call to scatter_add for the new version --- spacy/syntax/nn_parser.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index c9a4926fc..96fdbab6d 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -1,5 +1,4 @@ # cython: infer_types=True -# cython: profile=True # cython: cdivision=True # cython: boundscheck=False # coding: utf-8 @@ -435,8 +434,7 @@ cdef class Parser: cdef int nr_hidden = hidden_weights.shape[0] cdef int nr_task = states.size() with nogil: - for i in cython.parallel.prange(nr_task, num_threads=2, - schedule='guided'): + for i in range(nr_task): self._parseC(states[i], feat_weights, bias, hW, hb, nr_class, nr_hidden, nr_feat, nr_piece) @@ -697,9 +695,10 @@ cdef class Parser: xp = get_array_module(d_tokvecs) for ids, d_vector, bp_vector in backprops: d_state_features = bp_vector(d_vector, sgd=sgd) - mask = ids >= 0 - d_state_features *= mask.reshape(ids.shape + (1,)) - self.model[0].ops.scatter_add(d_tokvecs, ids * mask, + ids = ids.flatten() + d_state_features = d_state_features.reshape( + (ids.size, d_state_features.shape[2])) + self.model[0].ops.scatter_add(d_tokvecs, ids, d_state_features) bp_tokvecs(d_tokvecs, sgd=sgd) From 783c0c87958e0af281f346de8d1957b93000c74a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 01:17:54 +0000 Subject: [PATCH 87/99] Remove unnecessary bz2 import --- spacy/vocab.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index bcd1f3c10..1a91c2c0e 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,7 +1,6 @@ # coding: utf8 from __future__ import unicode_literals -import bz2 import ujson import re import numpy From b9616419e1395745ce59288d01e591d72f80f0c8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 01:18:05 +0000 Subject: [PATCH 88/99] Add try/except around bz2 import --- spacy/cli/model.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/spacy/cli/model.py b/spacy/cli/model.py index 14e75647e..bcc1626bc 100644 --- a/spacy/cli/model.py +++ b/spacy/cli/model.py @@ -1,8 +1,11 @@ # coding: utf8 from __future__ import unicode_literals -import bz2 -import gzip +try: + import bz2 + import gzip +except ImportError: + pass import math from ast import literal_eval from pathlib import Path From 4d272e25eeb2360c27a8adc6719e416e48b3a5de Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 03:55:04 +0200 Subject: [PATCH 89/99] Fix examples --- examples/pipeline/custom_attr_methods.py | 2 +- examples/pipeline/custom_component_countries_api.py | 2 +- examples/pipeline/custom_component_entities.py | 2 +- examples/training/train_parser.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/pipeline/custom_attr_methods.py b/examples/pipeline/custom_attr_methods.py index 741541b06..18d6b482a 100644 --- 
a/examples/pipeline/custom_attr_methods.py +++ b/examples/pipeline/custom_attr_methods.py @@ -10,7 +10,7 @@ they're called on is passed in as the first argument. Developed for: spaCy 2.0.0a17 Last updated for: spaCy 2.0.0a18 """ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function import plac from spacy.lang.en import English diff --git a/examples/pipeline/custom_component_countries_api.py b/examples/pipeline/custom_component_countries_api.py index 38eec7384..e7371e205 100644 --- a/examples/pipeline/custom_component_countries_api.py +++ b/examples/pipeline/custom_component_countries_api.py @@ -11,7 +11,7 @@ coordinates. Can be extended with more details from the API. Developed for: spaCy 2.0.0a17 Last updated for: spaCy 2.0.0a18 """ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function import requests import plac diff --git a/examples/pipeline/custom_component_entities.py b/examples/pipeline/custom_component_entities.py index 050a89905..6b78744b7 100644 --- a/examples/pipeline/custom_component_entities.py +++ b/examples/pipeline/custom_component_entities.py @@ -11,7 +11,7 @@ respectively. Developed for: spaCy 2.0.0a17 Last updated for: spaCy 2.0.0a18 """ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function import plac from spacy.lang.en import English diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index 30a6f6095..a23d73ec7 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -90,7 +90,7 @@ def main(model=None, output_dir=None, n_iter=1000): nlp.to_disk(output_dir) print("Saved model to", output_dir) - # test the save model + # test the saved model print("Loading from", output_dir) nlp2 = spacy.load(output_dir) doc = nlp2(test_text) From 9dfca0f2f8fb53314dfe874fd327b07239669438 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 03:55:11 +0200 Subject: [PATCH 90/99] Add example for custom intent parser --- examples/training/train_intent_parser.py | 157 +++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 examples/training/train_intent_parser.py diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py new file mode 100644 index 000000000..e67f26aff --- /dev/null +++ b/examples/training/train_intent_parser.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python +# coding: utf-8 +"""Using the parser to recognise your own semantics spaCy's parser component +can be used to trained to predict any type of tree structure over your input +text. You can also predict trees over whole documents or chat logs, with +connections between the sentence-roots used to annotate discourse structure. + +In this example, we'll build a message parser for a common "chat intent": +finding local businesses. Our message semantics will have the following types +of relations: INTENT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION. 
For example: + +"show me the best hotel in berlin" +('show', 'ROOT', 'show') +('best', 'QUALITY', 'hotel') --> hotel with QUALITY best +('hotel', 'PLACE', 'show') --> show PLACE hotel +('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin +""" +from __future__ import unicode_literals, print_function + +import plac +import random +import spacy +from spacy.gold import GoldParse +from spacy.tokens import Doc +from pathlib import Path + + +# training data: words, head and dependency labels +# for no relation, we simply chose an arbitrary dependency label, e.g. '-' +TRAIN_DATA = [ + ( + ['find', 'a', 'cafe', 'with', 'great', 'wifi'], + [0, 2, 0, 5, 5, 2], # index of token head + ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE'] + ), + ( + ['find', 'a', 'hotel', 'near', 'the', 'beach'], + [0, 2, 0, 5, 5, 2], + ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE'] + ), + ( + ['find', 'me', 'the', 'closest', 'gym', 'that', "'s", 'open', 'late'], + [0, 0, 4, 4, 0, 6, 4, 6, 6], + ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME'] + ), + ( + ['show', 'me', 'the', 'cheapest', 'store', 'that', 'sells', 'flowers'], + [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store! + ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT'] + ), + ( + ['find', 'a', 'nice', 'restaurant', 'in', 'london'], + [0, 3, 3, 0, 3, 3], + ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION'] + ), + ( + ['show', 'me', 'the', 'coolest', 'hostel', 'in', 'berlin'], + [0, 0, 4, 4, 0, 4, 4], + ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION'] + ), + ( + ['find', 'a', 'good', 'italian', 'restaurant', 'near', 'work'], + [0, 4, 4, 4, 0, 4, 5], + ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION'] + ) +] + + +@plac.annotations( + model=("Model name. 
Defaults to blank 'en' model.", "option", "m", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int)) +def main(model=None, output_dir=None, n_iter=100): + """Load the model, set up the pipeline and train the parser.""" + if model is not None: + nlp = spacy.load(model) # load existing spaCy model + print("Loaded model '%s'" % model) + else: + nlp = spacy.blank('en') # create blank Language class + print("Created blank 'en' model") + + # add the parser to the pipeline if it doesn't exist + # nlp.create_pipe works for built-ins that are registered with spaCy + if 'parser' not in nlp.pipe_names: + parser = nlp.create_pipe('parser') + nlp.add_pipe(parser, first=True) + # otherwise, get it, so we can add labels to it + else: + parser = nlp.get_pipe('parser') + + for _, _, deps in TRAIN_DATA: + for dep in deps: + parser.add_label(dep) + + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser'] + with nlp.disable_pipes(*other_pipes): # only train parser + optimizer = nlp.begin_training(lambda: []) + for itn in range(n_iter): + random.shuffle(TRAIN_DATA) + losses = {} + for words, heads, deps in TRAIN_DATA: + doc = Doc(nlp.vocab, words=words) + gold = GoldParse(doc, heads=heads, deps=deps) + nlp.update([doc], [gold], sgd=optimizer, losses=losses) + print(losses) + + # test the trained model + test_model(nlp) + + # save model to output directory + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + print("Saved model to", output_dir) + + # test the saved model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + test_model(nlp2) + + +def test_model(nlp): + texts = ["find a hotel with good wifi", + "find me the cheapest gym near work", + "show me the best hotel in berlin"] + docs = nlp.pipe(texts) + for doc in docs: + print(doc.text) + print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-']) + + +if __name__ == '__main__': + plac.call(main) + + # Expected output: + # find a hotel with good wifi + # [ + # ('find', 'ROOT', 'find'), + # ('hotel', 'PLACE', 'find'), + # ('good', 'QUALITY', 'wifi'), + # ('wifi', 'ATTRIBUTE', 'hotel') + # ] + # find me the cheapest gym near work + # [ + # ('find', 'ROOT', 'find'), + # ('cheapest', 'QUALITY', 'gym'), + # ('gym', 'PLACE', 'find') + # ] + # show me the best hotel in berlin + # [ + # ('show', 'ROOT', 'show'), + # ('best', 'QUALITY', 'hotel'), + # ('hotel', 'PLACE', 'show'), + # ('berlin', 'LOCATION', 'hotel') + # ] From 954c88f4d899ee10fc46147ae0c3e46e9e87bb0a Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 04:48:41 +0200 Subject: [PATCH 91/99] Fix formatting --- website/usage/examples.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/usage/examples.jade b/website/usage/examples.jade index a97471dbe..9515e5ca3 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -136,7 +136,7 @@ include ../_includes/_mixins +github("spacy", "examples/training/train_textcat.py") +section("vectors") - +h(3, "fasttext") Loading pre-trained FastText vectors + +h(3, "fasttext") Loading pre-trained fastText vectors p | This simple snippet is all you need to be able to use the Facebook's From b5643d857572e1ffcc92df4d59de76e704de38ac Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 04:49:05 +0200 Subject: [PATCH 92/99] Update intent parser docs and add to usage docs --- 
examples/training/train_intent_parser.py | 14 +-- website/usage/_training/_tagger-parser.jade | 96 +++++++++++++++++++++ website/usage/examples.jade | 14 +++ 3 files changed, 117 insertions(+), 7 deletions(-) diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py index e67f26aff..def0ed370 100644 --- a/examples/training/train_intent_parser.py +++ b/examples/training/train_intent_parser.py @@ -1,13 +1,13 @@ #!/usr/bin/env python # coding: utf-8 -"""Using the parser to recognise your own semantics spaCy's parser component -can be used to trained to predict any type of tree structure over your input -text. You can also predict trees over whole documents or chat logs, with -connections between the sentence-roots used to annotate discourse structure. +"""Using the parser to recognise your own semantics -In this example, we'll build a message parser for a common "chat intent": -finding local businesses. Our message semantics will have the following types -of relations: INTENT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION. For example: +spaCy's parser component can be used to trained to predict any type of tree +structure over your input text. You can also predict trees over whole documents +or chat logs, with connections between the sentence-roots used to annotate +discourse structure. In this example, we'll build a message parser for a common +"chat intent": finding local businesses. Our message semantics will have the +following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION. "show me the best hotel in berlin" ('show', 'ROOT', 'show') diff --git a/website/usage/_training/_tagger-parser.jade b/website/usage/_training/_tagger-parser.jade index c32577a73..d8388f4d7 100644 --- a/website/usage/_training/_tagger-parser.jade +++ b/website/usage/_training/_tagger-parser.jade @@ -95,6 +95,102 @@ p +item | #[strong Test] the model to make sure the parser works as expected. ++h(3, "intent-parser") Training a parser for custom semantics + +p + | spaCy's parser component can be used to trained to predict any type + | of tree structure over your input text – including + | #[strong semantic relations] that are not syntactic dependencies. This + | can be useful to for #[strong conversational applications], which need to + | predict trees over whole documents or chat logs, with connections between + | the sentence roots used to annotate discourse structure. For example, you + | can train spaCy's parser to label intents and their targets, like + | attributes, quality, time and locations. The result could look like this: + ++codepen("991f245ef90debb78c8fc369294f75ad", 300) + ++code. + doc = nlp(u"find a hotel with good wifi") + print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-']) + # [('find', 'ROOT', 'find'), ('hotel', 'PLACE', 'find'), + # ('good', 'QUALITY', 'wifi'), ('wifi', 'ATTRIBUTE', 'hotel')] + +p + | The above tree attaches "wifi" to "hotel" and assigns the dependency + | label #[code ATTRIBUTE]. This may not be a correct syntactic dependency – + | but in this case, it expresses exactly what we need: the user is looking + | for a hotel with the attribute "wifi" of the quality "good". This query + | can then be processed by your application and used to trigger the + | respective action – e.g. search the database for hotels with high ratings + | for their wifi offerings. 
+ ++aside("Tip: merge phrases and entities") + | To achieve even better accuracy, try merging multi-word tokens and + | entities specific to your domain into one token before parsing your text. + | You can do this by running the entity recognizer or + | #[+a("/usage/linguistic-features#rule-based-matching") rule-based matcher] + | to find relevant spans, and merging them using + | #[+api("span#merge") #[code Span.merge]]. You could even add your own + | custom #[+a("/usage/processing-pipelines#custom-components") pipeline component] + | to do this automatically – just make sure to add it #[code before='parser']. + +p + | The following example example shows a full implementation of a training + | loop for a custom message parser for a common "chat intent": finding + | local businesses. Our message semantics will have the following types + | of relations: #[code ROOT], #[code PLACE], #[code QUALITY], + | #[code ATTRIBUTE], #[code TIME] and #[code LOCATION]. + ++github("spacy", "examples/training/train_intent_parser.py") + ++h(4) Step by step guide + ++list("numbers") + +item + | #[strong Create the training data] consisting of words, their heads + | and their dependency labels in order. A token's head is the index + | of the token it is attached to. The heads don't need to be + | syntactically correct – they should express the + | #[strong semantic relations] you want the parser to learn. For words + | that shouldn't receive a label, you can choose an arbitrary + | placeholder, for example #[code -]. + + +item + | #[strong Load the model] you want to start with, or create an + | #[strong empty model] using + | #[+api("spacy#blank") #[code spacy.blank]] with the ID of your + | language. If you're using a blank model, don't forget to add the + | parser to the pipeline. If you're using an existing model, + | make sure to disable all other pipeline components during training + | using #[+api("language#disable_pipes") #[code nlp.disable_pipes]]. + | This way, you'll only be training the parser. + + +item + | #[strong Add the dependency labels] to the parser using the + | #[+api("dependencyparser#add_label") #[code add_label]] method. + + +item + | #[strong Shuffle and loop over] the examples and create a + | #[code Doc] and #[code GoldParse] object for each example. Make sure + | to pass in the #[code heads] and #[code deps] when you create the + | #[code GoldParse]. + + +item + | For each example, #[strong update the model] + | by calling #[+api("language#update") #[code nlp.update]], which steps + | through the words of the input. At each word, it makes a + | #[strong prediction]. It then consults the annotations provided on the + | #[code GoldParse] instance, to see whether it was + | right. If it was wrong, it adjusts its weights so that the correct + | action will score higher next time. + + +item + | #[strong Save] the trained model using + | #[+api("language#to_disk") #[code nlp.to_disk]]. + + +item + | #[strong Test] the model to make sure the parser works as expected. 
+ +h(3, "training-json") JSON format for training include ../../api/_annotation/_training diff --git a/website/usage/examples.jade b/website/usage/examples.jade index 9515e5ca3..5e415af8f 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -122,6 +122,20 @@ include ../_includes/_mixins +github("spacy", "examples/training/train_tagger.py") + +h(3, "intent-parser") Training a custom parser for chat intent semantics + + p + | spaCy's parser component can be used to trained to predict any type + | of tree structure over your input text. You can also predict trees + | over whole documents or chat logs, with connections between the + | sentence-roots used to annotate discourse structure. In this example, + | we'll build a message parser for a common "chat intent": finding + | local businesses. Our message semantics will have the following types + | of relations: #[code ROOT], #[code PLACE], #[code QUALITY], + | #[code ATTRIBUTE], #[code TIME] and #[code LOCATION]. + + +github("spacy", "examples/training/train_intent_parser.py") + +h(3, "textcat") Training spaCy's text classifier +tag-new(2) From f6fef30adc217ed84dc658bc849cdee039663750 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 10:16:41 +0000 Subject: [PATCH 93/99] Remove dead code from spacy._ml --- spacy/_ml.py | 71 ++-------------------------------------------------- 1 file changed, 2 insertions(+), 69 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index b85f6ef9d..dd80e5b1a 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -348,58 +348,12 @@ def reapply(layer, n_times): return wrap(reapply_fwd, layer) - - def asarray(ops, dtype): def forward(X, drop=0.): return ops.asarray(X, dtype=dtype), None return layerize(forward) -def foreach(layer): - def forward(Xs, drop=0.): - results = [] - backprops = [] - for X in Xs: - result, bp = layer.begin_update(X, drop=drop) - results.append(result) - backprops.append(bp) - def backward(d_results, sgd=None): - dXs = [] - for d_result, backprop in zip(d_results, backprops): - dXs.append(backprop(d_result, sgd)) - return dXs - return results, backward - model = layerize(forward) - model._layers.append(layer) - return model - - -def rebatch(size, layer): - ops = layer.ops - def forward(X, drop=0.): - if X.shape[0] < size: - return layer.begin_update(X) - parts = _divide_array(X, size) - results, bp_results = zip(*[layer.begin_update(p, drop=drop) - for p in parts]) - y = ops.flatten(results) - def backward(dy, sgd=None): - d_parts = [bp(y, sgd=sgd) for bp, y in - zip(bp_results, _divide_array(dy, size))] - try: - dX = ops.flatten(d_parts) - except TypeError: - dX = None - except ValueError: - dX = None - return dX - return y, backward - model = layerize(forward) - model._layers.append(layer) - return model - - def _divide_array(X, size): parts = [] index = 0 @@ -508,11 +462,13 @@ def preprocess_doc(docs, drop=0.): vals = ops.allocate(keys.shape[0]) + 1 return (keys, vals, lengths), None + def getitem(i): def getitem_fwd(X, drop=0.): return X[i], None return layerize(getitem_fwd) + def build_tagger_model(nr_class, **cfg): embed_size = util.env_opt('embed_size', 7000) if 'token_vector_width' in cfg: @@ -552,29 +508,6 @@ def SpacyVectors(docs, drop=0.): return batch, None -def foreach(layer, drop_factor=1.0): - '''Map a layer across elements in a list''' - def foreach_fwd(Xs, drop=0.): - drop *= drop_factor - ys = [] - backprops = [] - for X in Xs: - y, bp_y = layer.begin_update(X, drop=drop) - ys.append(y) - backprops.append(bp_y) - def 
foreach_bwd(d_ys, sgd=None): - d_Xs = [] - for d_y, bp_y in zip(d_ys, backprops): - if bp_y is not None and bp_y is not None: - d_Xs.append(d_y, sgd=sgd) - else: - d_Xs.append(None) - return d_Xs - return ys, foreach_bwd - model = wrap(foreach_fwd, layer) - return model - - def build_text_classifier(nr_class, width=64, **cfg): nr_vector = cfg.get('nr_vector', 5000) pretrained_dims = cfg.get('pretrained_dims', 0) From 642eb28c168ae1251459bf0a8960cf68cdc1004b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 10:16:58 +0000 Subject: [PATCH 94/99] Don't compile with OpenMP by default --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2e2b816b7..a33826c23 100755 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ LINK_OPTIONS = { # I don't understand this very well yet. See Issue #267 # Fingers crossed! -USE_OPENMP_DEFAULT = '1' if sys.platform != 'darwin' else None +USE_OPENMP_DEFAULT = '0' if sys.platform != 'darwin' else None if os.environ.get('USE_OPENMP', USE_OPENMP_DEFAULT) == '1': if sys.platform == 'darwin': COMPILE_OPTIONS['other'].append('-fopenmp') From c9987cf131a5cc8d41437136dad1c765f20e5862 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 10:18:36 +0000 Subject: [PATCH 95/99] Avoid use of numpy.tensordot --- spacy/_ml.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index dd80e5b1a..de2bd4b86 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -127,24 +127,34 @@ class PrecomputableAffine(Model): self.nF = nF def begin_update(self, X, drop=0.): - tensordot = self.ops.xp.tensordot - ascontiguous = self.ops.xp.ascontiguousarray - - Yf = tensordot(X, self.W, axes=[[1], [3]]) + Yf = self.ops.dot(X, + self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T) + + Yf = Yf.reshape((X.shape[0], self.nF, self.nO, self.nP)) def backward(dY_ids, sgd=None): dY, ids = dY_ids Xf = X[ids] + Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI)) - dXf = tensordot(dY, self.W, axes=[[1,2], [1,2]]) - dW = tensordot(dY, Xf, axes=[[0], [0]]) - # (o, p, f, i) --> (f, o, p, i) - self.d_W += dW.transpose((2, 0, 1, 3)) self.d_b += dY.sum(axis=0) + dY = dY.reshape((dY.shape[0], self.nO*self.nP)) + + Wopfi = self.W.transpose((1, 2, 0, 3)) + Wopfi = self.ops.xp.ascontiguousarray(Wopfi) + Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI)) + dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi) + + # Reuse the buffer + dWopfi = Wopfi; dWopfi.fill(0.) 
+ self.ops.xp.dot(dY.T, Xf, out=dWopfi) + dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI)) + # (o, p, f, i) --> (f, o, p, i) + self.d_W += dWopfi.transpose((2, 0, 1, 3)) if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) - return dXf + return dXf.reshape((dXf.shape[0], self.nF, self.nI)) return Yf, backward @staticmethod @@ -168,9 +178,9 @@ class PrecomputableAffine(Model): size=tokvecs.size).reshape(tokvecs.shape) def predict(ids, tokvecs): - hiddens = model(tokvecs) + hiddens = model(tokvecs) # (b, f, o, p) vector = model.ops.allocate((hiddens.shape[0], model.nO, model.nP)) - model.ops.scatter_add(vector, ids, hiddens) + model.ops.xp.add.at(vector, ids, hiddens) vector += model.b if model.nP >= 2: return model.ops.maxout(vector)[0] @@ -318,8 +328,7 @@ def Tok2Vec(width, embed_size, **kwargs): tok2vec = ( FeatureExtracter(cols) - >> with_flatten( - embed >> (convolution ** 4), pad=4) + >> with_flatten(embed >> (convolution ** 4), pad=4) ) # Work around thinc API limitations :(. TODO: Revise in Thinc 7 From 75a637fa439893d4d60e23a9aa3e2af241faf84a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 10:19:56 +0000 Subject: [PATCH 96/99] Remove redundant imports from _ml --- spacy/pipeline.pyx | 2 +- spacy/syntax/nn_parser.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 7c1976dfa..685c8ee00 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -42,7 +42,7 @@ from .syntax import nonproj from .compat import json_dumps from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS -from ._ml import rebatch, Tok2Vec, flatten +from ._ml import Tok2Vec, flatten from ._ml import build_text_classifier, build_tagger_model from ._ml import link_vectors_to_models from .parts_of_speech import X diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 96fdbab6d..773ab4e63 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -47,7 +47,7 @@ from thinc.neural.util import get_array_module from .. 
import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine -from .._ml import Tok2Vec, doc2feats, rebatch +from .._ml import Tok2Vec, doc2feats from .._ml import Residual, drop_layer, flatten from .._ml import link_vectors_to_models from .._ml import HistoryFeatures From 4d048e94d3eaa88e038e56967c0bf7599d11f6ae Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 10:23:49 +0000 Subject: [PATCH 97/99] Add compat for thinc.neural.optimizers.Optimizer --- spacy/compat.py | 4 ++++ spacy/language.py | 11 ++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/spacy/compat.py b/spacy/compat.py index 81243ce1b..31b33e771 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -30,6 +30,10 @@ try: except ImportError: cupy = None +try: + from thinc.optimizers import Optimizer +except ImportError: + from thinc.optimizers import Adam as Optimizer pickle = pickle copy_reg = copy_reg diff --git a/spacy/language.py b/spacy/language.py index 933ca772d..adc2860eb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -3,7 +3,6 @@ from __future__ import absolute_import, unicode_literals from contextlib import contextmanager from thinc.neural import Model -from thinc.neural.optimizers import Adam import random import ujson from collections import OrderedDict @@ -21,6 +20,7 @@ from .syntax.parser import get_templates from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer +from .compat import Optimizer from .compat import json_dumps, izip, copy_reg from .scorer import Scorer from ._ml import link_vectors_to_models @@ -359,7 +359,8 @@ class Language(object): return if sgd is None: if self._optimizer is None: - self._optimizer = Adam(Model.ops, 0.001) + self._optimizer = Optimizer(Model.ops, 0.001, + beta1=0.9, beta2=0.0, nesterov=True) sgd = self._optimizer grads = {} def get_grads(W, dW, key=None): @@ -400,8 +401,8 @@ class Language(object): eps = util.env_opt('optimizer_eps', 1e-08) L2 = util.env_opt('L2_penalty', 1e-6) max_grad_norm = util.env_opt('grad_norm_clip', 1.) - self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, - beta2=beta2, eps=eps) + self._optimizer = Optimizer(Model.ops, learn_rate, L2=L2, beta1=beta1, + beta2=beta2, eps=eps, nesterov=True) self._optimizer.max_grad_norm = max_grad_norm self._optimizer.device = device return self._optimizer @@ -440,7 +441,7 @@ class Language(object): eps = util.env_opt('optimizer_eps', 1e-08) L2 = util.env_opt('L2_penalty', 1e-6) max_grad_norm = util.env_opt('grad_norm_clip', 1.) 
- self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, + self._optimizer = Optimizer(Model.ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps) self._optimizer.max_grad_norm = max_grad_norm self._optimizer.device = device From 52f1bf2729bf62b05fa554d567986cc6b852fb44 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 12:30:59 +0200 Subject: [PATCH 98/99] Adjust GitHub embeds --- website/_includes/_mixins.jade | 2 +- website/usage/_processing-pipelines/_custom-components.jade | 4 ++-- website/usage/_processing-pipelines/_multithreading.jade | 2 +- website/usage/_training/_ner.jade | 4 ++-- website/usage/_training/_tagger-parser.jade | 6 +++--- website/usage/_training/_textcat.jade | 2 +- website/usage/examples.jade | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index b7375e2e0..692b47887 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -181,7 +181,7 @@ mixin codepen(slug, height, default_tab) alt_file - [string] alternative file path used in footer and link button height - [integer] height of code preview in px -mixin github(repo, file, alt_file, height, language) +mixin github(repo, file, height, alt_file, language) - var branch = ALPHA ? "develop" : "master" - var height = height || 250 diff --git a/website/usage/_processing-pipelines/_custom-components.jade b/website/usage/_processing-pipelines/_custom-components.jade index ea3ea9b97..79cd77eef 100644 --- a/website/usage/_processing-pipelines/_custom-components.jade +++ b/website/usage/_processing-pipelines/_custom-components.jade @@ -234,7 +234,7 @@ p | when you customise spaCy's tokenization rules. When you call #[code nlp] | on a text, the custom pipeline component is applied to the #[code Doc] -+github("spacy", "examples/pipeline/custom_component_entities.py", false, 500) ++github("spacy", "examples/pipeline/custom_component_entities.py", 500) p | Wrapping this functionality in a @@ -255,7 +255,7 @@ p | #[code Token] – for example, the capital, latitude/longitude coordinates | and even the country flag. -+github("spacy", "examples/pipeline/custom_component_countries_api.py", false, 500) ++github("spacy", "examples/pipeline/custom_component_countries_api.py", 500) p | In this case, all data can be fetched on initialisation in one request. diff --git a/website/usage/_processing-pipelines/_multithreading.jade b/website/usage/_processing-pipelines/_multithreading.jade index a80768f38..4dff9c924 100644 --- a/website/usage/_processing-pipelines/_multithreading.jade +++ b/website/usage/_processing-pipelines/_multithreading.jade @@ -50,4 +50,4 @@ p | dataset and will be loaded automatically via Thinc's built-in dataset | loader. -+github("spacy", "examples/pipeline/multi_processing.py") ++github("spacy", "examples/pipeline/multi_processing.py", 500) diff --git a/website/usage/_training/_ner.jade b/website/usage/_training/_ner.jade index 12f92dbce..c1002ecdf 100644 --- a/website/usage/_training/_ner.jade +++ b/website/usage/_training/_ner.jade @@ -34,7 +34,7 @@ p | #[strong character offsets] and #[strong labels] of each entity contained | in the texts. - +github("spacy", "examples/training/train_ner.py") + +github("spacy", "examples/training/train_ner.py", 500) +h(4) Step by step guide @@ -88,7 +88,7 @@ p | recognizer over unlabelled sentences, and adding their annotations to the | training set. 
-+github("spacy", "examples/training/train_new_entity_type.py") ++github("spacy", "examples/training/train_new_entity_type.py", 500) +h(4) Step by step guide diff --git a/website/usage/_training/_tagger-parser.jade b/website/usage/_training/_tagger-parser.jade index d8388f4d7..f2fa4bab5 100644 --- a/website/usage/_training/_tagger-parser.jade +++ b/website/usage/_training/_tagger-parser.jade @@ -8,7 +8,7 @@ p | #[strong training examples] and the respective #[strong heads] and | #[strong dependency label] for each token of the example texts. -+github("spacy", "examples/training/train_parser.py") ++github("spacy", "examples/training/train_parser.py", 500) +h(4) Step by step guide @@ -61,7 +61,7 @@ p | #[strong custom tags], as well as a dictionary mapping those tags to the | #[+a("http://universaldependencies.github.io/docs/u/pos/index.html") Universal Dependencies scheme]. -+github("spacy", "examples/training/train_tagger.py") ++github("spacy", "examples/training/train_tagger.py", 500) +h(4) Step by step guide @@ -141,7 +141,7 @@ p | of relations: #[code ROOT], #[code PLACE], #[code QUALITY], | #[code ATTRIBUTE], #[code TIME] and #[code LOCATION]. -+github("spacy", "examples/training/train_intent_parser.py") ++github("spacy", "examples/training/train_intent_parser.py", 500) +h(4) Step by step guide diff --git a/website/usage/_training/_textcat.jade b/website/usage/_training/_textcat.jade index 5ccff7a84..b7b47c3ba 100644 --- a/website/usage/_training/_textcat.jade +++ b/website/usage/_training/_textcat.jade @@ -11,7 +11,7 @@ p | loader. Predictions are available via | #[+api("doc#attributes") #[code Doc.cats]]. -+github("spacy", "examples/training/train_textcat.py") ++github("spacy", "examples/training/train_textcat.py", 500) +h(4) Step by step guide diff --git a/website/usage/examples.jade b/website/usage/examples.jade index 5e415af8f..9ad800954 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -179,4 +179,4 @@ include ../_includes/_mixins | parameters, and was implemented using #[+a("https://keras.io") Keras] | and spaCy. - +github("spacy", "examples/keras_parikh_entailment/__main__.py", "examples/keras_parikh_entailment") + +github("spacy", "examples/keras_parikh_entailment/__main__.py", false, "examples/keras_parikh_entailment") From 19a2b9bf27f768a2c3f8c8033b1679e950b493a6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 12:33:42 +0000 Subject: [PATCH 99/99] Fix import of Optimizer --- spacy/compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/compat.py b/spacy/compat.py index 31b33e771..8dd3d6b03 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -31,9 +31,9 @@ except ImportError: cupy = None try: - from thinc.optimizers import Optimizer + from thinc.neural.optimizers import Optimizer except ImportError: - from thinc.optimizers import Adam as Optimizer + from thinc.neural.optimizers import Adam as Optimizer pickle = pickle copy_reg = copy_reg