From e4a45ae55fba89a65fc0851783fd712ae6d1755d Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Mon, 12 Jun 2017 12:28:51 +0200 Subject: [PATCH 001/110] Very minor documentation fix --- website/docs/usage/customizing-tokenizer.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index b1fbba652..354a56c22 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -214,7 +214,7 @@ p def __call__(self, text): words = text.split(' ') # All tokens 'own' a subsequent space character in this tokenizer - spaces = [True] * len(word) + spaces = [True] * len(words) return Doc(self.vocab, words=words, spaces=spaces) p From d19ce29a23de1805be3bb2b0a694a38d671fdfb3 Mon Sep 17 00:00:00 2001 From: Ian Mobbs Date: Mon, 12 Jun 2017 13:21:44 -0400 Subject: [PATCH 002/110] Create requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 8194dee58..20c587841 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ thinc>=6.5.0,<6.6.0 murmurhash>=0.26,<0.27 plac<1.0.0,>=0.9.6 six +html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 requests>=2.13.0,<3.0.0 From 81166c3d563bf5c3ca86924b06c4fd44dd6e3a11 Mon Sep 17 00:00:00 2001 From: Nathan Glenn Date: Wed, 21 Jun 2017 19:22:30 +0200 Subject: [PATCH 003/110] fix confusing typo This document describes the `Vocab` class, not the `Span` class. --- website/docs/api/vocab.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/vocab.jade b/website/docs/api/vocab.jade index 7490bccf4..c036c650b 100644 --- a/website/docs/api/vocab.jade +++ b/website/docs/api/vocab.jade @@ -124,7 +124,7 @@ p +cell #[code Lexeme] +cell The lexeme indicated by the given ID. -+h(2, "iter") Span.__iter__ ++h(2, "iter") Vocab.__iter__ +tag method p Iterate over the lexemes in the vocabulary. 
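The documentation fix in PATCH 001 above is small but meaningful: Doc(self.vocab, words=words, spaces=spaces) requires words and spaces to have the same length, which is what the corrected len(words) guarantees. A minimal sketch of the corrected whitespace tokenizer in use follows; the WhitespaceTokenizer class name, the bare Vocab() and the sample text are illustrative assumptions and do not come from the patches themselves.

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    class WhitespaceTokenizer(object):
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            words = text.split(' ')
            # All tokens 'own' a subsequent space character in this tokenizer,
            # so spaces needs one boolean per token: len(words), not len(word).
            spaces = [True] * len(words)
            return Doc(self.vocab, words=words, spaces=spaces)

    # Example usage with an empty vocabulary (unknown words are added on the fly).
    tokenizer = WhitespaceTokenizer(Vocab())
    doc = tokenizer(u'What is happening ?')
    assert [t.text for t in doc] == [u'What', u'is', u'happening', u'?']
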
From f69ff1508959e60ced2a0bf329aae07710bc9bde Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 27 Jun 2017 14:49:02 +0200 Subject: [PATCH 004/110] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index ea6096a52..c419a03cf 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -25,6 +25,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Ines Montani, [@ines](https://github.com/ines) * J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading) * Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan) +* Jim Regan, [@jimregan](https://github.com/jimregan) * Jordan Suchow, [@suchow](https://github.com/suchow) * Josh Reeter, [@jreeter](https://github.com/jreeter) * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks) From 84041a2bb517841d725781bdd72b1daf4f8e603d Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 28 Jun 2017 01:18:05 +0900 Subject: [PATCH 005/110] Make create_tokenizer work with Japanese --- spacy/ja/__init__.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 07e40ada6..1c85ded95 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -3,21 +3,39 @@ from __future__ import unicode_literals, print_function from os import path -from ..language import Language +from ..language import Language, BaseDefaults +from ..tokenizer import Tokenizer from ..attrs import LANG from ..tokens import Doc from .language_data import * - -class Japanese(Language): - lang = 'ja' - - def make_doc(self, text): +class JapaneseTokenizer(object): + def __init__(self, cls, nlp=None): + self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) try: from janome.tokenizer import Tokenizer except ImportError: raise ImportError("The Japanese tokenizer requires the Janome library: " "https://github.com/mocobeta/janome") - words = [x.surface for x in Tokenizer().tokenize(text)] + self.tokenizer = Tokenizer() + + def __call__(self, text): + words = [x.surface for x in self.tokenizer.tokenize(text)] return Doc(self.vocab, words=words, spaces=[False]*len(words)) + +class JapaneseDefaults(BaseDefaults): + @classmethod + def create_tokenizer(cls, nlp=None): + return JapaneseTokenizer(cls, nlp) + +class Japanese(Language): + lang = 'ja' + + Defaults = JapaneseDefaults + + def make_doc(self, text): + words = self.tokenizer(text) + return Doc(self.vocab, words=words, spaces=[False]*len(words)) + + From e56fea14eb7e807d5ea4ee5fdd12f7ca0610690a Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 28 Jun 2017 01:24:25 +0900 Subject: [PATCH 006/110] Add basic Japanese tokenizer test --- spacy/tests/conftest.py | 8 +++++++- spacy/tests/ja/__init__.py | 0 spacy/tests/ja/test_tokenizer.py | 8 ++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/ja/__init__.py create mode 100644 spacy/tests/ja/test_tokenizer.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b8ada1d9a..b0f11b5a4 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -5,6 +5,7 @@ from ..en import English from ..de import German from ..es import Spanish from ..it import Italian +from ..ja import Japanese from ..fr import French from ..pt import Portuguese from ..nl import Dutch @@ -27,7 +28,7 @@ import os import pytest -LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, +LANGUAGES = 
[English, German, Spanish, Italian, Japanese, French, Portuguese, Dutch, Swedish, Hungarian, Finnish, Bengali, Norwegian] @@ -76,6 +77,11 @@ def fi_tokenizer(): return Finnish.Defaults.create_tokenizer() +@pytest.fixture +def ja_tokenizer(): + return Japanese.Defaults.create_tokenizer() + + @pytest.fixture def sv_tokenizer(): return Swedish.Defaults.create_tokenizer() diff --git a/spacy/tests/ja/__init__.py b/spacy/tests/ja/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/ja/test_tokenizer.py b/spacy/tests/ja/test_tokenizer.py new file mode 100644 index 000000000..8d45c822d --- /dev/null +++ b/spacy/tests/ja/test_tokenizer.py @@ -0,0 +1,8 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +def test_japanese_tokenizer(ja_tokenizer): + tokens = ja_tokenizer("日本語だよ") + assert len(tokens) == 3 From 1b3a5d87bad69dcb8ec9cdb26ec030f7894708ec Mon Sep 17 00:00:00 2001 From: Alexis Date: Wed, 28 Jun 2017 14:11:20 +0200 Subject: [PATCH 007/110] French NUM_WORDS and ORDINAL_WORDS --- spacy/fr/stop_words.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/spacy/fr/stop_words.py b/spacy/fr/stop_words.py index d9b820537..71f124d6c 100644 --- a/spacy/fr/stop_words.py +++ b/spacy/fr/stop_words.py @@ -86,3 +86,28 @@ votre vous vous-mêmes vu vé vôtre vôtres zut """.split()) + + + +# Number words + +NUM_WORDS = set(""" +zero un deux trois quatre cinq six sept huit neuf dix +onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf +vingt trente quanrante cinquante soixante septante quatre-vingt huitante nonante +cent mille mil million milliard billion quadrillion quintillion +sextillion septillion octillion nonillion decillion +""".split()) + +# Ordinal words + +ORDINAL_WORDS = set(""" +premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième +onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neufième +vingtième trentième quanrantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième +centième millième millionnième milliardième billionnième quadrillionnième quintillionnième +sextillionnième septillionnième octillionnième nonillionnième decillionnième +""".split()) + + + From 30a34ebb6edb513e262d1f47b6742b4480282f3c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 29 Jun 2017 00:09:20 +0900 Subject: [PATCH 008/110] Add importorskip for janome --- spacy/tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b0f11b5a4..222f9aa1d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -79,6 +79,7 @@ def fi_tokenizer(): @pytest.fixture def ja_tokenizer(): + janome = pytest.importorskip("janome") return Japanese.Defaults.create_tokenizer() From c33619339217dbeff75243d7493dc60685ddf28c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 29 Jun 2017 00:09:40 +0900 Subject: [PATCH 009/110] Parametrize and extend Japanese tokenizer tests --- spacy/tests/ja/test_tokenizer.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/spacy/tests/ja/test_tokenizer.py b/spacy/tests/ja/test_tokenizer.py index 8d45c822d..58700b353 100644 --- a/spacy/tests/ja/test_tokenizer.py +++ b/spacy/tests/ja/test_tokenizer.py @@ -3,6 +3,15 @@ from __future__ import unicode_literals import pytest -def test_japanese_tokenizer(ja_tokenizer): - tokens = ja_tokenizer("日本語だよ") - assert len(tokens) == 3 
+TOKENIZER_TESTS = [ + ("日本語だよ", ['日本語', 'だ', 'よ']), + ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']), + ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']), + ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']), + ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']) +] + +@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) +def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens): + tokens = [token.text for token in ja_tokenizer(text)] + assert tokens == expected_tokens From dfaeee1f37d8b7b614e55cd732c6c89abb9afd92 Mon Sep 17 00:00:00 2001 From: Callum Kift Date: Fri, 30 Jun 2017 09:56:33 +0200 Subject: [PATCH 010/110] fixed bug in training ner documentation and example --- examples/training/train_new_entity_type.py | 2 +- website/docs/usage/training-ner.jade | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 4eae11c75..987ab5859 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -52,6 +52,7 @@ def train_ner(nlp, train_data, output_dir): random.shuffle(train_data) loss = 0. for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) gold = GoldParse(doc, entities=entity_offsets) # By default, the GoldParse class assumes that the entities # described by offset are complete, and all other words should @@ -63,7 +64,6 @@ def train_ner(nlp, train_data, output_dir): #for i in range(len(gold.ner)): #if not gold.ner[i].endswith('ANIMAL'): # gold.ner[i] = '-' - doc = nlp.make_doc(raw_text) nlp.tagger(doc) # As of 1.9, spaCy's parser now lets you supply a dropout probability # This might help the model generalize better from only a few diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade index 78eb4905e..52eedd21e 100644 --- a/website/docs/usage/training-ner.jade +++ b/website/docs/usage/training-ner.jade @@ -150,8 +150,8 @@ p for itn in range(20): random.shuffle(train_data) for raw_text, entity_offsets in train_data: - gold = GoldParse(doc, entities=entity_offsets) doc = nlp.make_doc(raw_text) + gold = GoldParse(doc, entities=entity_offsets) nlp.tagger(doc) loss = nlp.entity.update(doc, gold) nlp.end_training() From 669bd142130f3e3c66b253efd0df1dd7ce2ba3f4 Mon Sep 17 00:00:00 2001 From: gispk47 Date: Sat, 1 Jul 2017 13:12:00 +0800 Subject: [PATCH 011/110] Update __init__.py remove the empty string return from jieba.cut,this will cause the list of tokens cant be pushed assert error --- spacy/zh/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/zh/__init__.py b/spacy/zh/__init__.py index 1847a7d8d..0f407dec6 100644 --- a/spacy/zh/__init__.py +++ b/spacy/zh/__init__.py @@ -8,4 +8,5 @@ class Chinese(Language): def make_doc(self, text): import jieba words = list(jieba.cut(text, cut_all=True)) + words=[x for x in words if x] return Doc(self.vocab, words=words, spaces=[False]*len(words)) From 5357874bf74b05a40961ba05936f6009453a48b8 Mon Sep 17 00:00:00 2001 From: Swier Date: Wed, 5 Jul 2017 14:03:30 +0200 Subject: [PATCH 012/110] add Dutch numbers and ordinals --- spacy/nl/stop_words.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/spacy/nl/stop_words.py b/spacy/nl/stop_words.py index 22f1d714c..d19515262 100644 --- a/spacy/nl/stop_words.py +++ b/spacy/nl/stop_words.py @@ -41,3 +41,22 @@ want waren was wat we wel werd wezen wie wij wil worden zal ze zei zelf zich zij zijn zo 
zonder zou """.split()) + + +# Number words + +NUM_WORDS = set(""" +nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien +veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd +duizend miljoen miljard biljoen biljard triljoen triljard +""".split()) + + +# Ordinal words + +ORDINAL_WORDS = set(""" +eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde +twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste +zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste +miljardste biljoenste biljardste triljoenste triljardste +""".split()) From f377c9c952ed6b42086c0ee9fcedb5a67af963b4 Mon Sep 17 00:00:00 2001 From: Swier Date: Wed, 5 Jul 2017 14:06:28 +0200 Subject: [PATCH 013/110] Rename stop_words.py to word_sets.py --- spacy/nl/{stop_words.py => word_sets.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/nl/{stop_words.py => word_sets.py} (100%) diff --git a/spacy/nl/stop_words.py b/spacy/nl/word_sets.py similarity index 100% rename from spacy/nl/stop_words.py rename to spacy/nl/word_sets.py From 29720150f9960c1a57b2d463d4653e0a8f3211e0 Mon Sep 17 00:00:00 2001 From: Swier Date: Wed, 5 Jul 2017 14:08:04 +0200 Subject: [PATCH 014/110] fix import of stop words in language data --- spacy/nl/language_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/nl/language_data.py b/spacy/nl/language_data.py index f9899d8d1..b3ca1aef9 100644 --- a/spacy/nl/language_data.py +++ b/spacy/nl/language_data.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from .. import language_data as base from ..language_data import update_exc, strings_to_exc -from .stop_words import STOP_WORDS +from .word_sets import STOP_WORDS, NUM_WORDS STOP_WORDS = set(STOP_WORDS) From 19d4706f69b8788bffc43ab0bf07a80a1ed5bdab Mon Sep 17 00:00:00 2001 From: val314159 Date: Fri, 7 Jul 2017 13:18:17 -0700 Subject: [PATCH 015/110] make this work in python2.7 --- website/docs/usage/lightning-tour.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 138b0058d..2fd390d26 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -83,7 +83,7 @@ p +h(2, "examples-word-vectors") Word vectors +code. - doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") + doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.") apples = doc[0] oranges = doc[2] From 04e6a6518869b1ca15beb79694049e0fb164a2aa Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 9 Jul 2017 16:23:26 +0900 Subject: [PATCH 016/110] Remove Japanese from LANGUAGES LANGUAGES is a list of languages whose tokenizers get run through a variety of generic tests. Since the generic tests don't check the JA fixture, it blows up when it can't find janome. 
-POLM --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 222f9aa1d..29d896a5d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -28,7 +28,7 @@ import os import pytest -LANGUAGES = [English, German, Spanish, Italian, Japanese, French, Portuguese, Dutch, +LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, Swedish, Hungarian, Finnish, Bengali, Norwegian] From bc87b815cc34d375e1a4b4c9b54c296691cee237 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 9 Jul 2017 16:28:55 +0900 Subject: [PATCH 017/110] Add comment clarifying what LANGUAGES does --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 29d896a5d..6e00b1513 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -27,7 +27,7 @@ from pathlib import Path import os import pytest - +# These languages get run through generic tokenizer tests LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, Swedish, Hungarian, Finnish, Bengali, Norwegian] From 6cf26909438230b4f9626d6cf25a19ecd0d1555c Mon Sep 17 00:00:00 2001 From: lgenerknol Date: Wed, 12 Jul 2017 11:06:16 -0400 Subject: [PATCH 018/110] Missing markup char Frontend displayed: ``` If start_idx and do not mark[...] ``` Note the missing "end_idx" after 'and'. --- website/docs/api/doc.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index adcd111a3..1c2911f52 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -272,7 +272,7 @@ p Import the document contents from a binary string. p | Retokenize the document, such that the span at | #[code doc.text[start_idx : end_idx]] is merged into a single token. If - | #[code start_idx] and #[end_idx] do not mark start and end token + | #[code start_idx] and #[code end_idx] do not mark start and end token | boundaries, the document remains unchanged. +table(["Name", "Type", "Description"]) From 2b219caf0d01e98e10b82b940ba184a63ead64a5 Mon Sep 17 00:00:00 2001 From: lgenerknol Date: Wed, 12 Jul 2017 13:12:24 -0400 Subject: [PATCH 019/110] .../cli/#foo is 404 https://spacy.io/docs/usage/cli/#package is a 404. Changed to https://spacy.io/docs/usage/cli#package Definitely a larger fix possible to deal with trailing slashes --- website/docs/usage/saving-loading.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index c4eb08f04..8978cce7a 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -28,7 +28,7 @@ p | and walk you through generating the meta data. You can also create the | meta.json manually and place it in the model data directory, or supply a | path to it using the #[code --meta] flag. For more info on this, see the - | #[+a("/docs/usage/cli/#package") #[code package] command] documentation. + | #[+a("/docs/usage/cli#package") #[code package] command] documentation. +aside-code("meta.json", "json"). 
{ From fadacd0d47a898173ae68bdfb758e688f7a176ce Mon Sep 17 00:00:00 2001 From: Jorge Paredes Date: Sun, 16 Jul 2017 10:06:32 -0500 Subject: [PATCH 020/110] Fix url broken The related url to **custom named entities** was broken --- website/docs/usage/models.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 9bb75ba9a..30863720c 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -203,7 +203,7 @@ p p | If you've trained your own model, for example for | #[+a("/docs/usage/adding-languages") additional languages] or - | #[+a("/docs/usage/train-ner") custom named entities], you can save its + | #[+a("/docs/usage/training-ner") custom named entities], you can save its | state using the #[code Language.save_to_directory()] method. To make the | model more convenient to deploy, we recommend wrapping it as a Python | package. From 8bb443be4fc63fd76e6ddf48008aacfe3a716398 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:28:51 +0200 Subject: [PATCH 021/110] Add standalone tagger training example --- examples/training/train_tagger_ud.py | 150 +++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 examples/training/train_tagger_ud.py diff --git a/examples/training/train_tagger_ud.py b/examples/training/train_tagger_ud.py new file mode 100644 index 000000000..3015c52e8 --- /dev/null +++ b/examples/training/train_tagger_ud.py @@ -0,0 +1,150 @@ +from __future__ import unicode_literals +from __future__ import print_function + +import plac +import codecs +import spacy.symbols as symbols +import spacy +from pathlib import Path + +from spacy.vocab import Vocab +from spacy.tagger import Tagger +from spacy.tokens import Doc +from spacy.gold import GoldParse +from spacy.language import Language +from spacy import orth +from spacy import attrs + +import random + +TAG_MAP = { + 'ADJ': {symbols.POS: symbols.ADJ}, + 'ADP': {symbols.POS: symbols.ADP}, + 'PUNCT': {symbols.POS: symbols.PUNCT}, + 'ADV': {symbols.POS: symbols.ADV}, + 'AUX': {symbols.POS: symbols.AUX}, + 'SYM': {symbols.POS: symbols.SYM}, + 'INTJ': {symbols.POS: symbols.INTJ}, + 'CCONJ': {symbols.POS: symbols.CCONJ}, + 'X': {symbols.POS: symbols.X}, + 'NOUN': {symbols.POS: symbols.NOUN}, + 'DET': {symbols.POS: symbols.DET}, + 'PROPN': {symbols.POS: symbols.PROPN}, + 'NUM': {symbols.POS: symbols.NUM}, + 'VERB': {symbols.POS: symbols.VERB}, + 'PART': {symbols.POS: symbols.PART}, + 'PRON': {symbols.POS: symbols.PRON}, + 'SCONJ': {symbols.POS: symbols.SCONJ}, +} + +LEX_ATTR_GETTERS = { + attrs.LOWER: lambda string: string.lower(), + attrs.NORM: lambda string: string, + attrs.SHAPE: orth.word_shape, + attrs.PREFIX: lambda string: string[0], + attrs.SUFFIX: lambda string: string[-3:], + attrs.CLUSTER: lambda string: 0, + attrs.IS_ALPHA: orth.is_alpha, + attrs.IS_ASCII: orth.is_ascii, + attrs.IS_DIGIT: lambda string: string.isdigit(), + attrs.IS_LOWER: orth.is_lower, + attrs.IS_PUNCT: orth.is_punct, + attrs.IS_SPACE: lambda string: string.isspace(), + attrs.IS_TITLE: orth.is_title, + attrs.IS_UPPER: orth.is_upper, + attrs.IS_BRACKET: orth.is_bracket, + attrs.IS_QUOTE: orth.is_quote, + attrs.IS_LEFT_PUNCT: orth.is_left_punct, + attrs.IS_RIGHT_PUNCT: orth.is_right_punct, + attrs.LIKE_URL: orth.like_url, + attrs.LIKE_NUM: orth.like_number, + attrs.LIKE_EMAIL: orth.like_email, + attrs.IS_STOP: lambda string: False, + attrs.IS_OOV: lambda string: True +} + + +def read_ud_data(path): + data = [] + 
last_number = -1 + sentence_words = [] + sentence_tags = [] + with codecs.open(path, encoding="utf-8") as f: + while True: + line = f.readline() + if not line: + break + + if line[0].isdigit(): + d = line.split() + if not "-" in d[0]: + number = int(line[0]) + if number < last_number: + data.append((sentence_words, sentence_tags),) + sentence_words = [] + sentence_tags = [] + sentence_words.append(d[2]) + sentence_tags.append(d[3]) + last_number = number + if len(sentence_words) > 0: + data.append((sentence_words, sentence_tags,)) + return data + +def ensure_dir(path): + if not path.exists(): + path.mkdir() + + +def main(train_loc, dev_loc, output_dir=None): + if output_dir is not None: + output_dir = Path(output_dir) + ensure_dir(output_dir) + ensure_dir(output_dir / "pos") + ensure_dir(output_dir / "vocab") + + train_data = read_ud_data(train_loc) + vocab = Vocab(tag_map=TAG_MAP, lex_attr_getters=LEX_ATTR_GETTERS) + # Populate vocab + for words, _ in train_data: + for word in words: + _ = vocab[word] + + model = spacy.tagger.TaggerModel(spacy.tagger.Tagger.feature_templates) + tagger = Tagger(vocab, model) + print(tagger.tag_names) + for i in range(30): + print("training model (iteration " + str(i) + ")...") + score = 0. + num_samples = 0. + for words, tags in train_data: + doc = Doc(vocab, words=words) + gold = GoldParse(doc, tags=tags) + cost = tagger.update(doc, gold) + for i, word in enumerate(doc): + num_samples += 1 + if word.tag_ == tags[i]: + score += 1 + print('Train acc', score/num_samples) + random.shuffle(train_data) + tagger.model.end_training() + + score = 0.0 + test_data = read_ud_data(dev_loc) + num_samples = 0 + for words, tags in test_data: + doc = Doc(vocab, words) + tagger(doc) + for i, word in enumerate(doc): + num_samples += 1 + if word.tag_ == tags[i]: + score += 1 + print("score: " + str(score / num_samples * 100.0)) + + if output_dir is not None: + tagger.model.dump(str(output_dir / 'pos' / 'model')) + with (output_dir / 'vocab' / 'strings.json').open('w') as file_: + tagger.vocab.strings.dump(file_) + + +if __name__ == '__main__': + plac.call(main) From 3fef5f642bd7f40cbc41319e51e71579bde791f9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:29:15 +0200 Subject: [PATCH 022/110] Rename tagger training example --- .../{train_tagger_ud.py => train_tagger_standalone_ud.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/training/{train_tagger_ud.py => train_tagger_standalone_ud.py} (100%) diff --git a/examples/training/train_tagger_ud.py b/examples/training/train_tagger_standalone_ud.py similarity index 100% rename from examples/training/train_tagger_ud.py rename to examples/training/train_tagger_standalone_ud.py From a405660068f9f1c17a71a54866f475b2b13eef6c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:32:48 +0200 Subject: [PATCH 023/110] Add commit to tagger example --- examples/training/train_new_entity_type.py | 4 ++-- examples/training/train_tagger_standalone_ud.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 4eae11c75..6c432acdf 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -24,8 +24,8 @@ For more details, see the documentation: * Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner * Saving and loading models: https://spacy.io/docs/usage/saving-loading -Developed for: spaCy 
1.7.6 -Last tested for: spaCy 1.7.6 +Developed for: spaCy 1.9.0 +Last tested for: spaCy 1.9.0 """ from __future__ import unicode_literals, print_function diff --git a/examples/training/train_tagger_standalone_ud.py b/examples/training/train_tagger_standalone_ud.py index 3015c52e8..ce1ab50d6 100644 --- a/examples/training/train_tagger_standalone_ud.py +++ b/examples/training/train_tagger_standalone_ud.py @@ -1,3 +1,17 @@ +''' +This example shows training of the POS tagger without the Language class, +showing the APIs of the atomic components. + +This example was adapted from the gist here: + +https://gist.github.com/kamac/a7bc139f62488839a8118214a4d932f2 + +Issue discussing the gist: + +https://github.com/explosion/spaCy/issues/1179 + +The example was written for spaCy 1.8.2. +''' from __future__ import unicode_literals from __future__ import print_function From 5916d46ba8a9c85f5f8c115bb831561e3c64d256 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:34:01 +0200 Subject: [PATCH 024/110] Avoid use of deepcopy in printer --- spacy/tokens/printers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index d70088540..487d74167 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -49,6 +49,7 @@ def parse_tree(doc, light=False, flat=False): >>> trees = doc.print_tree() [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] """ - doc_clone = deepcopy(doc) + doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) + doc_clone.from_array(doc.to_array([HEAD, DEP, TAG, ENT_IOB, ENT_TYPE]) merge_ents(doc_clone) # merge the entities into single tokens first return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents] From 8b581fdac515173f80a2b1560f2b58286d3c92e3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:36:54 +0200 Subject: [PATCH 025/110] Remove unused example --- examples/chainer_sentiment.py | 322 ---------------------------------- 1 file changed, 322 deletions(-) delete mode 100644 examples/chainer_sentiment.py diff --git a/examples/chainer_sentiment.py b/examples/chainer_sentiment.py deleted file mode 100644 index 747ef508a..000000000 --- a/examples/chainer_sentiment.py +++ /dev/null @@ -1,322 +0,0 @@ -'''WIP --- 
Doesn't work well yet''' -import plac -import random -import six - -import cProfile -import pstats - -import pathlib -import cPickle as pickle -from itertools import izip - -import spacy - -import cytoolz -import cupy as xp -import cupy.cuda -import chainer.cuda - -import chainer.links as L -import chainer.functions as F -from chainer import Chain, Variable, report -import chainer.training -import chainer.optimizers -from chainer.training import extensions -from chainer.iterators import SerialIterator -from chainer.datasets import TupleDataset - - -class SentimentAnalyser(object): - @classmethod - def load(cls, path, nlp, max_length=100): - raise NotImplementedError - #with (path / 'config.json').open() as file_: - # model = model_from_json(file_.read()) - #with (path / 'model').open('rb') as file_: - # lstm_weights = pickle.load(file_) - #embeddings = get_embeddings(nlp.vocab) - #model.set_weights([embeddings] + lstm_weights) - #return cls(model, max_length=max_length) - - def __init__(self, model, max_length=100): - self._model = model - self.max_length = max_length - - def __call__(self, doc): - X = get_features([doc], self.max_length) - y = self._model.predict(X) - self.set_sentiment(doc, y) - - def pipe(self, docs, batch_size=1000, n_threads=2): - for minibatch in cytoolz.partition_all(batch_size, docs): - minibatch = list(minibatch) - sentences = [] - for doc in minibatch: - sentences.extend(doc.sents) - Xs = get_features(sentences, self.max_length) - ys = self._model.predict(Xs) - for sent, label in zip(sentences, ys): - sent.doc.sentiment += label - 0.5 - for doc in minibatch: - yield doc - - def set_sentiment(self, doc, y): - doc.sentiment = float(y[0]) - # Sentiment has a native slot for a single float. - # For arbitrary data storage, there's: - # doc.user_data['my_data'] = y - - -class Classifier(Chain): - def __init__(self, predictor): - super(Classifier, self).__init__(predictor=predictor) - - def __call__(self, x, t): - y = self.predictor(x) - loss = F.softmax_cross_entropy(y, t) - accuracy = F.accuracy(y, t) - report({'loss': loss, 'accuracy': accuracy}, self) - return loss - - -class SentimentModel(Chain): - def __init__(self, nlp, shape, **settings): - Chain.__init__(self, - embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'], - set_vectors=lambda arr: set_vectors(arr, nlp.vocab)), - encode=_Encode(shape['nr_hidden'], shape['nr_hidden']), - attend=_Attend(shape['nr_hidden'], shape['nr_hidden']), - predict=_Predict(shape['nr_hidden'], shape['nr_class'])) - self.to_gpu(0) - - def __call__(self, sentence): - return self.predict( - self.attend( - self.encode( - self.embed(sentence)))) - - -class _Embed(Chain): - def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None): - Chain.__init__(self, - embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors), - project=L.Linear(None, nr_out, nobias=True)) - self.embed.W.volatile = False - - def __call__(self, sentence): - return [self.project(self.embed(ts)) for ts in F.transpose(sentence)] - - -class _Encode(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self, - fwd=L.LSTM(nr_in, nr_out), - bwd=L.LSTM(nr_in, nr_out), - mix=L.Bilinear(nr_out, nr_out, nr_out)) - - def __call__(self, sentence): - self.fwd.reset_state() - fwds = map(self.fwd, sentence) - self.bwd.reset_state() - bwds = reversed(map(self.bwd, reversed(sentence))) - return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)] - - -class _Attend(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self) - - def __call__(self, 
sentence): - sent = sum(sentence) - return sent - - -class _Predict(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self, - l1=L.Linear(nr_in, nr_in), - l2=L.Linear(nr_in, nr_out)) - - def __call__(self, vector): - vector = self.l1(vector) - vector = F.elu(vector) - vector = self.l2(vector) - return vector - - -class SentenceDataset(TupleDataset): - def __init__(self, nlp, texts, labels, max_length): - self.max_length = max_length - sents, labels = self._get_labelled_sentences( - nlp.pipe(texts, batch_size=5000, n_threads=3), - labels) - TupleDataset.__init__(self, - get_features(sents, max_length), - labels) - - def __getitem__(self, index): - batches = [dataset[index] for dataset in self._datasets] - if isinstance(index, slice): - length = len(batches[0]) - returns = [tuple([batch[i] for batch in batches]) - for i in six.moves.range(length)] - return returns - else: - return tuple(batches) - - def _get_labelled_sentences(self, docs, doc_labels): - labels = [] - sentences = [] - for doc, y in izip(docs, doc_labels): - for sent in doc.sents: - sentences.append(sent) - labels.append(y) - return sentences, xp.asarray(labels, dtype='i') - - -class DocDataset(TupleDataset): - def __init__(self, nlp, texts, labels): - self.max_length = max_length - DatasetMixin.__init__(self, - get_features( - nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length), - labels) - -def read_data(data_dir, limit=0): - examples = [] - for subdir, label in (('pos', 1), ('neg', 0)): - for filename in (data_dir / subdir).iterdir(): - with filename.open() as file_: - text = file_.read() - examples.append((text, label)) - random.shuffle(examples) - if limit >= 1: - examples = examples[:limit] - return zip(*examples) # Unzips into two lists - - -def get_features(docs, max_length): - docs = list(docs) - Xs = xp.zeros((len(docs), max_length), dtype='i') - for i, doc in enumerate(docs): - j = 0 - for token in doc: - if token.has_vector and not token.is_punct and not token.is_space: - Xs[i, j] = token.norm - j += 1 - if j >= max_length: - break - return Xs - - -def set_vectors(vectors, vocab): - for lex in vocab: - if lex.has_vector and (lex.rank+1) < vectors.shape[0]: - lex.norm = lex.rank+1 - vectors[lex.rank + 1] = lex.vector - else: - lex.norm = 0 - return vectors - - -def train(train_texts, train_labels, dev_texts, dev_labels, - lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5, - by_sentence=True): - nlp = spacy.load('en', entity=False) - if 'nr_vector' not in lstm_shape: - lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector) - if 'nr_dim' not in lstm_shape: - lstm_shape['nr_dim'] = nlp.vocab.vectors_length - print("Make model") - model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings)) - print("Parsing texts...") - if by_sentence: - train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length']) - dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length']) - else: - train_data = DocDataset(nlp, train_texts, train_labels) - dev_data = DocDataset(nlp, dev_texts, dev_labels) - train_iter = SerialIterator(train_data, batch_size=batch_size, - shuffle=True, repeat=True) - dev_iter = SerialIterator(dev_data, batch_size=batch_size, - shuffle=False, repeat=False) - optimizer = chainer.optimizers.Adam() - optimizer.setup(model) - updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0) - trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result') - - 
trainer.extend(extensions.Evaluator(dev_iter, model, device=0)) - trainer.extend(extensions.LogReport()) - trainer.extend(extensions.PrintReport([ - 'epoch', 'main/accuracy', 'validation/main/accuracy'])) - trainer.extend(extensions.ProgressBar()) - - trainer.run() - - -def evaluate(model_dir, texts, labels, max_length=100): - def create_pipeline(nlp): - ''' - This could be a lambda, but named functions are easier to read in Python. - ''' - return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp, - max_length=max_length)] - - nlp = spacy.load('en') - nlp.pipeline = create_pipeline(nlp) - - correct = 0 - i = 0 - for doc in nlp.pipe(texts, batch_size=1000, n_threads=4): - correct += bool(doc.sentiment >= 0.5) == bool(labels[i]) - i += 1 - return float(correct) / i - - -@plac.annotations( - train_dir=("Location of training file or directory"), - dev_dir=("Location of development file or directory"), - model_dir=("Location of output model directory",), - is_runtime=("Demonstrate run-time usage", "flag", "r", bool), - nr_hidden=("Number of hidden units", "option", "H", int), - max_length=("Maximum sentence length", "option", "L", int), - dropout=("Dropout", "option", "d", float), - learn_rate=("Learn rate", "option", "e", float), - nb_epoch=("Number of training epochs", "option", "i", int), - batch_size=("Size of minibatches for training LSTM", "option", "b", int), - nr_examples=("Limit to N examples", "option", "n", int) -) -def main(model_dir, train_dir, dev_dir, - is_runtime=False, - nr_hidden=64, max_length=100, # Shape - dropout=0.5, learn_rate=0.001, # General NN config - nb_epoch=5, batch_size=32, nr_examples=-1): # Training params - model_dir = pathlib.Path(model_dir) - train_dir = pathlib.Path(train_dir) - dev_dir = pathlib.Path(dev_dir) - if is_runtime: - dev_texts, dev_labels = read_data(dev_dir) - acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length) - print(acc) - else: - print("Read data") - train_texts, train_labels = read_data(train_dir, limit=nr_examples) - dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples) - print("Using GPU 0") - #chainer.cuda.get_device(0).use() - train_labels = xp.asarray(train_labels, dtype='i') - dev_labels = xp.asarray(dev_labels, dtype='i') - lstm = train(train_texts, train_labels, dev_texts, dev_labels, - {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2, - 'nr_vector': 5000}, - {'dropout': 0.5, 'lr': learn_rate}, - {}, - nb_epoch=nb_epoch, batch_size=batch_size) - - -if __name__ == '__main__': - #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") - #s = pstats.Stats("Profile.prof") - #s.strip_dirs().sort_stats("time").print_stats() - plac.call(main) From 69396dcfd35cf40c9706bf1199f3de8b8e7a06a5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 22 Jul 2017 13:43:15 +0200 Subject: [PATCH 026/110] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index c419a03cf..bfdbf5c4f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -3,6 +3,7 @@ This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work! 
* Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer) +* Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman) * Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv) * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th) * Aniruddha Adhikary [@aniruddha-adhikary](https://github.com/aniruddha-adhikary) @@ -47,6 +48,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Sam Bozek, [@sambozek](https://github.com/sambozek) * Sasho Savkov, [@savkov](https://github.com/savkov) * Shuvanon Razik, [@shuvanon](https://github.com/shuvanon) +* Swier, [@swierh](https://github.com/swierh) * Thomas Tanon, [@Tpt](https://github.com/Tpt) * Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues) * Vsevolod Solovyov, [@vsolovyov](https://github.com/vsolovyov) From 8b9c4c5e1c80e7e3814b39a64e58a24c005b15f0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:43:47 +0200 Subject: [PATCH 027/110] Add missing SP symbol to tag map, re #1052 --- spacy/language_data/tag_map.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/language_data/tag_map.py b/spacy/language_data/tag_map.py index ead6dd1c6..65dab9b0d 100644 --- a/spacy/language_data/tag_map.py +++ b/spacy/language_data/tag_map.py @@ -22,5 +22,6 @@ TAG_MAP = { "CCONJ": {POS: CCONJ}, # U20 "ADJ": {POS: ADJ}, "VERB": {POS: VERB}, - "PART": {POS: PART} + "PART": {POS: PART}, + 'SP': {POS: SPACE} } From 45f6961ae0f54f1e6cbb6fb59158e2ce03e27417 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:45:21 +0200 Subject: [PATCH 028/110] Add __version__ symbol in __init__.py --- spacy/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/__init__.py b/spacy/__init__.py index 2308ce7e4..3afb38cfb 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -5,6 +5,7 @@ from . import util from .deprecated import resolve_model_name from .cli.info import info from .glossary import explain +from .about import __version__ from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja From 0ae3807d7df39b70cc45fc973b84701d9c4f9e25 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:53:48 +0200 Subject: [PATCH 029/110] Fix gaps in Lexeme API. 
Closes #1031 --- spacy/lexeme.pyx | 9 +++++++++ spacy/tests/regression/test_issue1031.py | 13 +++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 spacy/tests/regression/test_issue1031.py diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 05d8bddc6..dc0440486 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -159,6 +159,10 @@ cdef class Lexeme: def __get__(self): return self.c.id + property lex_id: + def __get__(self): + return self.c.id + property repvec: def __get__(self): raise AttributeError("lex.repvec has been renamed to lex.vector") @@ -173,6 +177,11 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.orth] + property text: + def __get__(self): + return self.vocab.strings[self.c.orth] + + property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x diff --git a/spacy/tests/regression/test_issue1031.py b/spacy/tests/regression/test_issue1031.py new file mode 100644 index 000000000..1ac14eb7b --- /dev/null +++ b/spacy/tests/regression/test_issue1031.py @@ -0,0 +1,13 @@ +from ...vocab import Vocab + +def test_lexeme_text(): + vocab = Vocab() + lex = vocab[u'the'] + assert lex.text == u'the' + + +def test_lexeme_lex_id(): + vocab = Vocab() + lex1 = vocab[u'the'] + lex2 = vocab[u'be'] + assert lex1.lex_id != lex2.lex_id From dfbc7e49de96c9e8980c89706d4889244d1f6e39 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 14:14:01 +0200 Subject: [PATCH 030/110] Add test for Issue #1207 --- spacy/tests/regression/test_issue1307.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 spacy/tests/regression/test_issue1307.py diff --git a/spacy/tests/regression/test_issue1307.py b/spacy/tests/regression/test_issue1307.py new file mode 100644 index 000000000..a71faebcb --- /dev/null +++ b/spacy/tests/regression/test_issue1307.py @@ -0,0 +1,25 @@ +from __future__ import unicode_literals +from ..util import get_doc +from ...vocab import Vocab +from ...en import English + + +def test_span_noun_chunks(): + vocab = Vocab(lang='en', tag_map=English.Defaults.tag_map) + words = "Employees are recruiting talented staffers from overseas .".split() + heads = [1, 1, 0, 1, -2, -1, -5] + deps = ['nsubj', 'aux', 'ROOT', 'nmod', 'dobj', 'adv', 'pobj'] + tags = ['NNS', 'VBP', 'VBG', 'JJ', 'NNS', 'IN', 'NN', '.'] + doc = get_doc(vocab, words=words, heads=heads, deps=deps, tags=tags) + doc.is_parsed = True + + noun_chunks = [np.text for np in doc.noun_chunks] + assert noun_chunks == ['Employees', 'talented staffers', 'overseas'] + + span = doc[0:4] + noun_chunks = [np.text for np in span.noun_chunks] + assert noun_chunks == ['Employees'] + + for sent in doc.sents: + noun_chunks = [np.text for np in sent.noun_chunks] + assert noun_chunks == ['Employees', 'talented staffers', 'overseas'] From d9b85675d79553b4435aef1140354161c3f5dc91 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 14:14:35 +0200 Subject: [PATCH 031/110] Rename regression test --- spacy/tests/regression/{test_issue1307.py => test_issue1207.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/tests/regression/{test_issue1307.py => test_issue1207.py} (100%) diff --git a/spacy/tests/regression/test_issue1307.py b/spacy/tests/regression/test_issue1207.py similarity index 100% rename from spacy/tests/regression/test_issue1307.py rename to spacy/tests/regression/test_issue1207.py From 9750a0128cf211dac80217eee38e41c38f2c761c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 
2017 14:14:57 +0200 Subject: [PATCH 032/110] Fix Span.noun_chunks. Closes #1207 --- spacy/tokens/span.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 09927ab4c..d8890addc 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -230,7 +230,7 @@ cdef class Span: # so it's okay once we have the Span objects. See Issue #375 spans = [] for start, end, label in self.doc.noun_chunks_iterator(self): - spans.append(Span(self, start, end, label=label)) + spans.append(Span(self.doc, start, end, label=label)) for span in spans: yield span From 23a55b40ca8af1af588b6cbf5504b8d87e3b91d5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 14:15:25 +0200 Subject: [PATCH 033/110] Default to English noun chunks iterator if no lang set --- spacy/syntax/iterators.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index b0d1c78ca..0fe724622 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -117,4 +117,5 @@ def es_noun_chunks(obj): token = next_token(token) -CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks} +CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks, + None: english_noun_chunks, '': english_noun_chunks} From e3f23f9d910b0fa0e5c71b5b4c5c2a243fe66e60 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 14:57:51 +0200 Subject: [PATCH 034/110] Use latest available version in examples --- website/docs/usage/models.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 30863720c..42a3c0bbf 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -67,7 +67,7 @@ p python -m spacy download en_core_web_md # download exact model version (doesn't create shortcut link) - python -m spacy download en_core_web_md-1.2.0 --direct + python -m spacy download en_core_web_md-1.2.1 --direct p | The download command will #[+a("#download-pip") install the model] via @@ -96,10 +96,10 @@ p +code(false, "bash"). # with external URL - pip install #{gh("spacy-models")}/releases/download/en_core_web_md-1.2.0/en_core_web_md-1.2.0.tar.gz + pip install #{gh("spacy-models")}/releases/download/en_core_web_md-1.2.1/en_core_web_md-1.2.1.tar.gz # with local file - pip install /Users/you/en_core_web_md-1.2.0.tar.gz + pip install /Users/you/en_core_web_md-1.2.1.tar.gz p | By default, this will install the model into your #[code site-packages] From b22b18a0199ae2856f8f6923fb0db1cebe74dbb5 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 15:02:15 +0200 Subject: [PATCH 035/110] Add notes on spacy.explain() to annotation docs --- website/docs/api/annotation.jade | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 8c6b8fb10..30080dfd9 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -38,6 +38,11 @@ p +h(2, "pos-tagging") Part-of-speech Tagging ++infobox("Tip: Understanding tags") + | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | description for the string representation of a tag. For example, + | #[code spacy.explain("RB")] will return "adverb". 
+ include _annotation/_pos-tags +h(2, "lemmatization") Lemmatization @@ -65,10 +70,20 @@ p +h(2, "dependency-parsing") Syntactic Dependency Parsing ++infobox("Tip: Understanding labels") + | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | description for the string representation of a label. For example, + | #[code spacy.explain("prt")] will return "particle". + include _annotation/_dep-labels +h(2, "named-entities") Named Entity Recognition ++infobox("Tip: Understanding entity types") + | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | description for the string representation of an entity label. For example, + | #[code spacy.explain("LANGUAGE")] will return "any named language". + include _annotation/_named-entities +h(2, "json-input") JSON input format for training From 96df9c7154b7967a145423200be62fa245039e8b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 22 Jul 2017 15:05:46 +0200 Subject: [PATCH 036/110] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index bfdbf5c4f..995f6901f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -58,3 +58,4 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang) * Yasuaki Uechi, [@uetchy](https://github.com/uetchy) * Yubing Dong, [@tomtung](https://github.com/tomtung) +* Yuval Pinter, [@yuvalpinter](https://github.com/yuvalpinter) From 4b2e5e59eda15c5f60710acbfb8624f748a169fc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 15:06:50 +0200 Subject: [PATCH 037/110] Add flush_cache method to tokenizer, to fix #1061 The tokenizer caches output for common chunks, for efficiency. This cache is be invalidated when the tokenizer rules change, e.g. when a new special-case rule is introduced. That's what was causing #1061. When the cache is flushed, we free the intermediate token chunks. I *think* this is safe --- but if we start getting segfaults, this patch is to blame. The resolution would be to simply not free those bits of memory. They'll be freed when the tokenizer exits anyway. --- spacy/tests/regression/test_issue1061.py | 27 ++++++++++++++ spacy/tokenizer.pyx | 46 +++++++++++++++++++++--- 2 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 spacy/tests/regression/test_issue1061.py diff --git a/spacy/tests/regression/test_issue1061.py b/spacy/tests/regression/test_issue1061.py new file mode 100644 index 000000000..821ca2bfc --- /dev/null +++ b/spacy/tests/regression/test_issue1061.py @@ -0,0 +1,27 @@ +from __future__ import unicode_literals + +from ...symbols import ORTH + +from ...vocab import Vocab +from ...en import English + + +def test_issue1061(): + '''Test special-case works after tokenizing. Was caching problem.''' + text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.' + tokenizer = English.Defaults.create_tokenizer() + doc = tokenizer(text) + assert 'MATH' in [w.text for w in doc] + assert '_MATH_' not in [w.text for w in doc] + + tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}]) + doc = tokenizer(text) + assert '_MATH_' in [w.text for w in doc] + assert 'MATH' not in [w.text for w in doc] + + # For sanity, check it works when pipeline is clean. 
+ tokenizer = English.Defaults.create_tokenizer() + tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}]) + doc = tokenizer(text) + assert '_MATH_' in [w.text for w in doc] + assert 'MATH' not in [w.text for w in doc] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index c094bea0d..276f0ef20 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -186,7 +186,13 @@ cdef class Tokenizer: cdef int _try_cache(self, hash_t key, Doc tokens) except -1: cached = <_Cached*>self._cache.get(key) if cached == NULL: - return False + # See 'flush_cache' below for hand-wringing about + # how to handle this. + cached = <_Cached*>self._specials.get(key) + if cached == NULL: + return False + else: + self._cache.set(key, cached) cdef int i if cached.is_lex: for i in range(cached.length): @@ -201,9 +207,15 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) - self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) + special_case = self._specials.get(orig_key) + if special_case is not NULL: + for i in range(special_case.length): + tokens.push_back(&special_case.data.tokens[i], False) + self._cache.set(orig_key, special_case) + else: + span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) + self._attach_tokens(tokens, span, &prefixes, &suffixes) + self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, @@ -389,5 +401,29 @@ cdef class Tokenizer: cached.data.tokens = self.vocab.make_fused_token(substrings) key = hash_string(string) self._specials.set(key, cached) - self._cache.set(key, cached) self._rules[string] = substrings + # After changing the tokenization rules, the previous tokenization + # may be stale. + self.flush_cache() + + def flush_cache(self): + '''Flush the tokenizer's cache. May not free memory immediately. + + This is called automatically after `add_special_case`, but if you + write to the prefix or suffix functions, you'll have to call this + yourself. You may also need to flush the tokenizer cache after + changing the lex_attr_getter functions. + ''' + cdef hash_t key + for key in self._cache.keys(): + special_case = self._specials.get(key) + # Don't free data shared with special-case rules + if special_case is not NULL: + continue + cached = <_Cached*>self._cache.get(key) + if cached is not NULL: + self.mem.free(cached) + self._cache = PreshMap(1000) + # We could here readd the data from specials --- but if we loop over + # a bunch of special-cases, we'll get a quadratic behaviour. The extra + # lookup isn't so bad? Tough to tell. 
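The commit message of PATCH 037 above explains the underlying problem: tokenized chunks are cached for efficiency, so a special case added after the first call used to be shadowed by stale cache entries. A rough standalone sketch of the behaviour the patch establishes is given below, reusing only calls that appear in this diff (English.Defaults.create_tokenizer, add_special_case, flush_cache); it is an illustration under those assumptions, not official usage.

    from spacy.en import English
    from spacy.symbols import ORTH

    tokenizer = English.Defaults.create_tokenizer()
    text = u'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'

    # First call: '_MATH_' is split up by the default rules and the result is cached.
    doc = tokenizer(text)
    assert u'_MATH_' not in [w.text for w in doc]

    # add_special_case() now flushes the stale cache entries itself ...
    tokenizer.add_special_case(u'_MATH_', [{ORTH: u'_MATH_'}])

    # ... so the special case applies on the next call instead of the cached split.
    doc = tokenizer(text)
    assert u'_MATH_' in [w.text for w in doc]

    # Per the flush_cache() docstring, changes to the prefix/suffix functions or the
    # lex_attr_getters are not detected automatically; call tokenizer.flush_cache()
    # manually in that case.
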
From d7560047c5038fb4bf8a3f3a52b7a02ab6e88b25 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 15:24:31 +0200 Subject: [PATCH 038/110] Fix version --- website/docs/api/annotation.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 30080dfd9..d4b01a819 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -39,7 +39,7 @@ p +h(2, "pos-tagging") Part-of-speech Tagging +infobox("Tip: Understanding tags") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | In spaCy v1.9+, you can also use #[code spacy.explain()] to get the | description for the string representation of a tag. For example, | #[code spacy.explain("RB")] will return "adverb". @@ -71,7 +71,7 @@ p +h(2, "dependency-parsing") Syntactic Dependency Parsing +infobox("Tip: Understanding labels") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | In spaCy v1.9+, you can also use #[code spacy.explain()] to get the | description for the string representation of a label. For example, | #[code spacy.explain("prt")] will return "particle". @@ -80,7 +80,7 @@ include _annotation/_dep-labels +h(2, "named-entities") Named Entity Recognition +infobox("Tip: Understanding entity types") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | In spaCy v1.9+, you can also use #[code spacy.explain()] to get the | description for the string representation of an entity label. For example, | #[code spacy.explain("LANGUAGE")] will return "any named language". From de25bad036c7ddcf30181e71c4c1750ff6b93c18 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 15:29:10 +0200 Subject: [PATCH 039/110] Use lower min version for requests dependency (fixes #1137) Ensure compatibility with docker-compose and other packages --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 20c587841..fe273ee53 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ six html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 -requests>=2.13.0,<3.0.0 +requests>=2.11.0,<3.0.0 regex==2017.4.5 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 From 7c4bf9994d23f5b07ebed24034b8d8eee2eaa6f6 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 15:40:12 +0200 Subject: [PATCH 040/110] Add note on requirements and preventing model re-downloads (closes #1143) --- website/docs/usage/models.jade | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 42a3c0bbf..2d0f83663 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -198,6 +198,37 @@ p nlp = en_core_web_md.load() doc = nlp(u'This is a sentence.') ++h(3, "models-download") Downloading and requiring model dependencies + +p + | spaCy's built-in #[+api("cli#download") #[code download]] command + | is mostly intended as a convenient, interactive wrapper. It performs + | compatibility checks and prints detailed error messages and warnings. + | However, if you're downloading models as part of an automated build + | process, this only adds an unecessary layer of complexity. If you know + | which models your application needs, you should be specifying them directly. 
+ ++aside("Prevent re-downloading models") + | If you're installing a model from a URL, pip will usually re-download and + | re-install the package, even if you already have a matching + | version installed. To prevent this, simply add #[code #egg=] and the + | package name after the URL, e.g. #[code #egg=en_core_web_sm] or + | #[code #egg=en_core_web_sm-1.2.0]. This tells pip which package and version + | you're trying to download, and will skip the package if a matching + | installation is found. + +p + | Because all models are valid Python packages, you can add them to your + | application's #[code requirements.txt]. If you're running your own + | internal PyPi installation, you can simply upload the models there. pip's + | #[+a("https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format") requirements file format] + | supports both package names to download via a PyPi server, as well as direct + | URLs. + ++code("requirements.txt", "text"). + spacy>=1.8.0,<2.0.0 + -e #{gh("spacy-models")}/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz#egg=en_core_web_sm-1.2.0 + +h(2, "own-models") Using your own models p From 796b2f4c1b49401f7cb490df174fe32f0186bc56 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 15:42:38 +0200 Subject: [PATCH 041/110] Remove print statements in tests --- spacy/tests/regression/test_issue693.py | 2 -- spacy/tests/regression/test_issue995.py | 1 - 2 files changed, 3 deletions(-) diff --git a/spacy/tests/regression/test_issue693.py b/spacy/tests/regression/test_issue693.py index e4d907716..5deeb3215 100644 --- a/spacy/tests/regression/test_issue693.py +++ b/spacy/tests/regression/test_issue693.py @@ -14,7 +14,5 @@ def test_issue693(EN): doc2 = EN(text2) chunks1 = [chunk for chunk in doc1.noun_chunks] chunks2 = [chunk for chunk in doc2.noun_chunks] - for word in doc1: - print(word.text, word.dep_, word.head.text) assert len(chunks1) == 2 assert len(chunks2) == 2 diff --git a/spacy/tests/regression/test_issue995.py b/spacy/tests/regression/test_issue995.py index 633e96fb5..108b434a2 100644 --- a/spacy/tests/regression/test_issue995.py +++ b/spacy/tests/regression/test_issue995.py @@ -15,7 +15,6 @@ def test_issue955(doc): '''Test that we don't have any nested noun chunks''' seen_tokens = set() for np in doc.noun_chunks: - print(np.text, np.root.text, np.root.dep_, np.root.tag_) for word in np: key = (word.i, word.text) assert key not in seen_tokens From d51d55bba673cbe35784589825ff88fd33bb1f73 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 15:43:16 +0200 Subject: [PATCH 042/110] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 8c0e0afd3..d34c6f948 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.8.2' +__version__ = '1.9.0' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' From 78fcf56dd5ce0beeebdcd58c0082f78d751ba206 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 15:57:58 +0200 Subject: [PATCH 043/110] Update version pin for regex library --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index fe273ee53..6f7d067a5 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -11,7 +11,7 @@ html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 requests>=2.11.0,<3.0.0 -regex==2017.4.5 +regex==2017.7.11 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 pip>=9.0.0,<10.0.0 diff --git a/setup.py b/setup.py index 89aaf8eba..61bd6b6bb 100755 --- a/setup.py +++ b/setup.py @@ -203,7 +203,7 @@ def setup_package(): 'ujson>=1.35', 'dill>=0.2,<0.3', 'requests>=2.13.0,<3.0.0', - 'regex==2017.4.5', + 'regex==2017.7.11', 'ftfy>=4.4.2,<5.0.0'], classifiers=[ 'Development Status :: 5 - Production/Stable', From 5494605689758238e92703fa759a2f56cbb00598 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 16:09:50 +0200 Subject: [PATCH 044/110] Fiddle with regex pin --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6f7d067a5..9d6f34133 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 requests>=2.11.0,<3.0.0 -regex==2017.7.11 +regex>=2017.4.1,<2017.12.1 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 pip>=9.0.0,<10.0.0 diff --git a/setup.py b/setup.py index 61bd6b6bb..1b127962b 100755 --- a/setup.py +++ b/setup.py @@ -203,7 +203,7 @@ def setup_package(): 'ujson>=1.35', 'dill>=0.2,<0.3', 'requests>=2.13.0,<3.0.0', - 'regex==2017.7.11', + 'regex>=2017.4.1,<2017.12.1', 'ftfy>=4.4.2,<5.0.0'], classifiers=[ 'Development Status :: 5 - Production/Stable', From 570964e67f0c7a12e64551cd4b71dca3c40b6ad8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 22 Jul 2017 16:20:19 +0200 Subject: [PATCH 045/110] Update README.rst --- README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 0f3efc146..9d52a6c9d 100644 --- a/README.rst +++ b/README.rst @@ -63,11 +63,12 @@ MIT license. 💬 Where to ask questions ========================== +Please understand that we won't be able to provide individual support via email. We also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. + ====================== === **Bug reports** `GitHub issue tracker`_ **Usage questions** `StackOverflow`_, `Gitter chat`_, `Reddit user group`_ **General discussion** `Gitter chat`_, `Reddit user group`_ -**Commercial support** contact@explosion.ai ====================== === .. _GitHub issue tracker: https://github.com/explosion/spaCy/issues From e349271506b66e4257e8c69e02c664bbb0442fda Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 18:29:30 +0200 Subject: [PATCH 046/110] Increment version --- website/_harp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index cb476541a..37a0b54dd 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -12,7 +12,7 @@ "COMPANY_URL": "https://explosion.ai", "DEMOS_URL": "https://demos.explosion.ai", - "SPACY_VERSION": "1.8", + "SPACY_VERSION": "1.9", "LATEST_NEWS": { "url": "/docs/usage/models", "title": "The first official Spanish model is here!" From 864cefd3b267e08a843703687fcd0b2587c8d080 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 18:29:55 +0200 Subject: [PATCH 047/110] Update README.rst --- README.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 9d52a6c9d..4efd5b1de 100644 --- a/README.rst +++ b/README.rst @@ -9,14 +9,14 @@ Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew, Chinese and Japanese. 
It's commercial open-source software, released under the MIT license. -⭐️ **Test spaCy v2.0.0 alpha and the new models!** `Read the release notes here. `_ +⭐️ **Test spaCy v2.0.0 alpha and the new models!** `Read the release notes. `_ -💫 **Version 1.8 out now!** `Read the release notes here. `_ +💫 **Version 1.9 out now!** `Read the release notes here. `_ .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square :target: https://travis-ci.org/explosion/spaCy :alt: Travis Build Status - + .. image:: https://img.shields.io/appveyor/ci/explosion/spacy/master.svg?style=flat-square :target: https://ci.appveyor.com/project/explosion/spacy :alt: Appveyor Build Status @@ -326,6 +326,7 @@ and ``--model`` are optional and enable additional tests: =========== ============== =========== Version Date Description =========== ============== =========== +`v1.9.0`_ ``2017-07-22`` Spanish model, alpha support for Norwegian & Japanese, and bug fixes `v1.8.2`_ ``2017-04-26`` French model and small improvements `v1.8.1`_ ``2017-04-23`` Saving, loading and training bug fixes `v1.8.0`_ ``2017-04-16`` Better NER training, saving and loading @@ -359,6 +360,7 @@ Version Date Description `v0.93`_ ``2015-09-22`` Bug fixes to word vectors =========== ============== =========== +.. _v1.9.0: https://github.com/explosion/spaCy/releases/tag/v1.9.0 .. _v1.8.2: https://github.com/explosion/spaCy/releases/tag/v1.8.2 .. _v1.8.1: https://github.com/explosion/spaCy/releases/tag/v1.8.1 .. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0 From 7e98a3613c4934709f3358594a928f476e2fa8f2 Mon Sep 17 00:00:00 2001 From: Gideon Dresdner Date: Sun, 6 Aug 2017 13:21:45 +0200 Subject: [PATCH 048/110] improve pipe, tee, izip explanation Use an example from an old issue https://github.com/explosion/spaCy/issues/172#issuecomment-183963403. --- website/docs/usage/processing-text.jade | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/processing-text.jade b/website/docs/usage/processing-text.jade index 4bd6132d2..600654f65 100644 --- a/website/docs/usage/processing-text.jade +++ b/website/docs/usage/processing-text.jade @@ -98,7 +98,8 @@ p | important metadata, e.g. a JSON document. To pair up the metadata | with the processed #[code Doc] object, you should use the tee | function to split the generator in two, and then #[code izip] the - | extra stream to the document stream. + | extra stream to the document stream. 
Here's an + | #[a(href="https://github.com/explosion/spaCy/issues/172#issuecomment-183963403")= "example"] +h(2, "own-annotations") Bringing your own annotations From d3b03f05441de59cfd45b7414d4aab6fd1b32242 Mon Sep 17 00:00:00 2001 From: Delirious Lettuce Date: Sun, 6 Aug 2017 21:31:39 -0600 Subject: [PATCH 049/110] Fix typos: * `auxillary` -> `auxiliary` * `consistute` -> `constitute` * `earlist` -> `earliest` * `prefered` -> `preferred` * `direcory` -> `directory` * `reuseable` -> `reusable` * `idiosyncracies` -> `idiosyncrasies` * `enviroment` -> `environment` * `unecessary` -> `unnecessary` * `yesteday` -> `yesterday` * `resouces` -> `resources` --- spacy/glossary.py | 4 ++-- website/docs/api/_annotation/_pos-tags.jade | 4 ++-- website/docs/api/features.jade | 2 +- website/docs/api/span.jade | 2 +- website/docs/usage/adding-languages.jade | 4 ++-- website/docs/usage/customizing-tokenizer.jade | 2 +- website/docs/usage/index.jade | 2 +- website/docs/usage/models.jade | 2 +- website/docs/usage/pos-tagging.jade | 2 +- website/docs/usage/saving-loading.jade | 2 +- 10 files changed, 13 insertions(+), 13 deletions(-) diff --git a/spacy/glossary.py b/spacy/glossary.py index 4df5264a6..ed1c22c21 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -60,7 +60,7 @@ GLOSSARY = { 'JJR': 'adjective, comparative', 'JJS': 'adjective, superlative', 'LS': 'list item marker', - 'MD': 'verb, modal auxillary', + 'MD': 'verb, modal auxiliary', 'NIL': 'missing tag', 'NN': 'noun, singular or mass', 'NNP': 'noun, proper singular', @@ -91,7 +91,7 @@ GLOSSARY = { 'NFP': 'superfluous punctuation', 'GW': 'additional word in multi-word expression', 'XX': 'unknown', - 'BES': 'auxillary "be"', + 'BES': 'auxiliary "be"', 'HVS': 'forms of "have"', diff --git a/website/docs/api/_annotation/_pos-tags.jade b/website/docs/api/_annotation/_pos-tags.jade index ea3a225bf..51db4f4e2 100644 --- a/website/docs/api/_annotation/_pos-tags.jade +++ b/website/docs/api/_annotation/_pos-tags.jade @@ -21,7 +21,7 @@ p +pos-row("$", "SYM", "SymType=currency", "symbol, currency") +pos-row("ADD", "X", "", "email") +pos-row("AFX", "ADJ", "Hyph=yes", "affix") - +pos-row("BES", "VERB", "", 'auxillary "be"') + +pos-row("BES", "VERB", "", 'auxiliary "be"') +pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating") +pos-row("CD", "NUM", "NumType=card", "cardinal number") +pos-row("DT", "DET", "determiner") @@ -35,7 +35,7 @@ p +pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative") +pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative") +pos-row("LS", "PUNCT", "NumType=ord", "list item marker") - +pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxillary") + +pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxiliary") +pos-row("NFP", "PUNCT", "", "superfluous punctuation") +pos-row("NIL", "", "", "missing tag") +pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass") diff --git a/website/docs/api/features.jade b/website/docs/api/features.jade index 018790145..21481cf65 100644 --- a/website/docs/api/features.jade +++ b/website/docs/api/features.jade @@ -18,7 +18,7 @@ p | consisting of the words to be processed. p - | Each state consists of the words on the stack (if any), which consistute + | Each state consists of the words on the stack (if any), which constitute | the current entity being constructed. We also have the current word, and | the two subsequent words. Finally, we also have the entities previously | built. 
diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index 770ee3e9b..d2d3d0f27 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -222,7 +222,7 @@ p The sentence span that this span is a part of. p | The token within the span that's highest in the parse tree. If there's a - | tie, the earlist is prefered. + | tie, the earliest is preferred. +table(["Name", "Type", "Description"]) +footrow diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 30c4486b0..7d893b4eb 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -28,7 +28,7 @@ p | #[a(href="#word-vectors") word vectors]. +item - | #[strong Set up] a #[a(href="#model-directory") model direcory] and #[strong train] the #[a(href="#train-tagger-parser") tagger and parser]. + | #[strong Set up] a #[a(href="#model-directory") model directory] and #[strong train] the #[a(href="#train-tagger-parser") tagger and parser]. p | For some languages, you may also want to develop a solution for @@ -303,7 +303,7 @@ p p | Because languages can vary in quite arbitrary ways, spaCy avoids | organising the language data into an explicit inheritance hierarchy. - | Instead, reuseable functions and data are collected as atomic pieces in + | Instead, reusable functions and data are collected as atomic pieces in | the #[code spacy.language_data] package. +aside-code("Example"). diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index 354a56c22..ca5be9ef1 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -21,7 +21,7 @@ p +h(2, "special-cases") Adding special case tokenization rules p - | Most domains have at least some idiosyncracies that require custom + | Most domains have at least some idiosyncrasies that require custom | tokenization rules. Here's how to add a special case rule to an existing | #[+api("tokenizer") #[code Tokenizer]] instance: diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index 9ad2fde5f..092c996b3 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -87,7 +87,7 @@ p | The other way to install spaCy is to clone its | #[+a(gh("spaCy")) GitHub repository] and build it from source. That is | the common way if you want to make changes to the code base. You'll need to - | make sure that you have a development enviroment consisting of a Python + | make sure that you have a development environment consisting of a Python | distribution including header files, a compiler, | #[+a("https://pip.pypa.io/en/latest/installing/") pip], | #[+a("https://virtualenv.pypa.io/") virtualenv] and diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 2d0f83663..4951ea211 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -205,7 +205,7 @@ p | is mostly intended as a convenient, interactive wrapper. It performs | compatibility checks and prints detailed error messages and warnings. | However, if you're downloading models as part of an automated build - | process, this only adds an unecessary layer of complexity. If you know + | process, this only adds an unnecessary layer of complexity. If you know | which models your application needs, you should be specifying them directly. 
+aside("Prevent re-downloading models") diff --git a/website/docs/usage/pos-tagging.jade b/website/docs/usage/pos-tagging.jade index cded00b6c..3f22ab43f 100644 --- a/website/docs/usage/pos-tagging.jade +++ b/website/docs/usage/pos-tagging.jade @@ -50,7 +50,7 @@ p +cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres] +row - +cell I read the paper yesteday + +cell I read the paper yesterday +cell read +cell read +cell verb diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 8978cce7a..56b218c29 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -58,7 +58,7 @@ p This command will create a model package directory that should look like this: p | You can also find templates for all files in our - | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. + | #[+a(gh("spacy-dev-resources", "templates/model")) spaCy dev resources]. | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of | #[code [language]_[name]] and #[code [language]_[name]-[version]]. The From 6e9e686568ab1f70d0b517e0d5f2bcbb894eb17a Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 8 Aug 2017 01:27:15 +0900 Subject: [PATCH 050/110] Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. 
-POLM --- spacy/ja/__init__.py | 92 +++++++++++++++++++++++++++++---- spacy/ja/tag_map.py | 97 +++++++++++++++++++++++++++++------ spacy/tests/conftest.py | 5 +- spacy/tests/ja/test_tagger.py | 10 ++++ 4 files changed, 177 insertions(+), 27 deletions(-) create mode 100644 spacy/tests/ja/test_tagger.py diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 1c85ded95..5f49f0b1b 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -5,37 +5,111 @@ from os import path from ..language import Language, BaseDefaults from ..tokenizer import Tokenizer +from ..tagger import Tagger from ..attrs import LANG from ..tokens import Doc from .language_data import * +import re +from collections import namedtuple + +ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) + class JapaneseTokenizer(object): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) try: - from janome.tokenizer import Tokenizer + import MeCab except ImportError: - raise ImportError("The Japanese tokenizer requires the Janome library: " - "https://github.com/mocobeta/janome") - self.tokenizer = Tokenizer() + raise ImportError("The Japanese tokenizer requires the MeCab library: " + "https://github.com/SamuraiT/mecab-python3") + self.tokenizer = MeCab.Tagger() def __call__(self, text): - words = [x.surface for x in self.tokenizer.tokenize(text)] + words = [x.surface for x in detailed_tokens(self.tokenizer, text)] return Doc(self.vocab, words=words, spaces=[False]*len(words)) +def resolve_pos(token): + """If necessary, add a field to the POS tag for UD mapping. + + Under Universal Dependencies, sometimes the same Unidic POS tag can + be mapped differently depending on the literal token or its context + in the sentence. This function adds information to the POS tag to + resolve ambiguous mappings. + """ + + # NOTE: This is a first take. The rules here are crude approximations. + # For many of these, full dependencies are needed to properly resolve + # PoS mappings. + + if token.part_of_speech == '連体詞,*,*,*': + # determiner-likes get DET, otherwise ADJ + if re.match('^[こそあど此其彼]の', token.surface): + return token.part_of_speech + ',DET' + else: + return token.part_of_speech + ',ADJ' + return token.part_of_speech + +def detailed_tokens(tokenizer, text): + """Format Mecab output into a nice data structure, based on Janome.""" + + node = tokenizer.parseToNode(text) + node = node.next # first node is beginning of sentence and empty, skip it + words = [] + while node.posid != 0: + parts = node.feature.split(',') + pos = ','.join(parts[0:4]) + reading = parts[6] + base = parts[7] + surface = parts[8] + + words.append( ShortUnitWord(surface, base, pos) ) + node = node.next + return words + +class JapaneseTagger(object): + def __init__(self, vocab): + try: + import MeCab + except ImportError: + raise ImportError("The Japanese tagger requires the MeCab library: " + "https://github.com/SamuraiT/mecab-python3") + + self.tagger = Tagger(vocab) + self.tokenizer = MeCab.Tagger() + + def __call__(self, tokens): + # two parts to this: + # 1. get raw JP tags + # 2. add features to tags as necessary for UD + + # TODO: if the text has been tokenized, this info is already available + # How to set the data when tokenizing or save it for the tagger to find? 
+ + dtokens = detailed_tokens(self.tokenizer, tokens.text) + rawtags = list(map(resolve_pos, dtokens)) + self.tagger.tag_from_strings(tokens, rawtags) + class JapaneseDefaults(BaseDefaults): + tag_map = TAG_MAP + @classmethod def create_tokenizer(cls, nlp=None): return JapaneseTokenizer(cls, nlp) + @classmethod + def create_tagger(cls, tokenizer): + return JapaneseTagger(tokenizer.vocab) + class Japanese(Language): lang = 'ja' Defaults = JapaneseDefaults def make_doc(self, text): - words = self.tokenizer(text) - return Doc(self.vocab, words=words, spaces=[False]*len(words)) - - + words = [str(t) for t in self.tokenizer(text)] + doc = Doc(self.vocab, words=words, spaces=[False]*len(words)) + tagger = JapaneseDefaults.create_tagger(self.tokenizer) + tagger(doc) + return doc diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index f5b6b5040..609739c2f 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -3,22 +3,85 @@ from __future__ import unicode_literals from ..symbols import * - TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB} + # Explanation of Unidic tags: + # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf + + # Universal Dependencies Mapping: + # http://universaldependencies.org/ja/overview/morphology.html + # http://universaldependencies.org/ja/pos/all.html + + "記号,一般,*,*":{POS: PUNCT}, # this includes characters used to represent sounds like ドレミ + "記号,文字,*,*":{POS: PUNCT}, # this is for Greek and Latin characters used as sumbols, as in math + + "感動詞,フィラー,*,*": {POS: INTJ}, + "感動詞,一般,*,*": {POS: INTJ}, + + # this is specifically for unicode full-width space + "空白,*,*,*": {POS: X}, + + "形状詞,一般,*,*":{POS: ADJ}, + "形状詞,タリ,*,*":{POS: ADJ}, + "形状詞,助動詞語幹,*,*":{POS: ADJ}, + "形容詞,一般,*,*":{POS: ADJ}, + "形容詞,非自立可能,*,*":{POS: AUX}, # XXX ADJ if alone, AUX otherwise + + "助詞,格助詞,*,*":{POS: ADP}, + "助詞,係助詞,*,*":{POS: ADP}, + "助詞,終助詞,*,*":{POS: PART}, + "助詞,準体助詞,*,*":{POS: SCONJ}, # の as in 走るのが速い + "助詞,接続助詞,*,*":{POS: SCONJ}, # verb ending て + "助詞,副助詞,*,*":{POS: PART}, # ばかり, つつ after a verb + "助動詞,*,*,*":{POS: AUX}, + "接続詞,*,*,*":{POS: SCONJ}, # XXX: might need refinement + + "接頭辞,*,*,*":{POS: NOUN}, + "接尾辞,形状詞的,*,*":{POS: ADJ}, # がち, チック + "接尾辞,形容詞的,*,*":{POS: ADJ}, # -らしい + "接尾辞,動詞的,*,*":{POS: NOUN}, # -じみ + "接尾辞,名詞的,サ変可能,*":{POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* + "接尾辞,名詞的,一般,*":{POS: NOUN}, + "接尾辞,名詞的,助数詞,*":{POS: NOUN}, + "接尾辞,名詞的,副詞可能,*":{POS: NOUN}, # -後, -過ぎ + + "代名詞,*,*,*":{POS: PRON}, + "動詞,一般,*,*":{POS: VERB}, + "動詞,非自立可能,*,*":{POS: AUX}, # XXX VERB if alone, AUX otherwise + "動詞,非自立可能,*,*,AUX":{POS: AUX}, + "動詞,非自立可能,*,*,VERB":{POS: VERB}, + "副詞,*,*,*":{POS: ADV}, + + "補助記号,AA,一般,*":{POS: SYM}, # text art + "補助記号,AA,顔文字,*":{POS: SYM}, # kaomoji + "補助記号,一般,*,*":{POS: SYM}, + "補助記号,括弧開,*,*":{POS: PUNCT}, # open bracket + "補助記号,括弧閉,*,*":{POS: PUNCT}, # close bracket + "補助記号,句点,*,*":{POS: PUNCT}, # period or other EOS marker + "補助記号,読点,*,*":{POS: PUNCT}, # comma + + "名詞,固有名詞,一般,*":{POS: PROPN}, # general proper noun + "名詞,固有名詞,人名,一般":{POS: PROPN}, # person's name + "名詞,固有名詞,人名,姓":{POS: PROPN}, # surname + "名詞,固有名詞,人名,名":{POS: PROPN}, # first name + "名詞,固有名詞,地名,一般":{POS: PROPN}, # place name + "名詞,固有名詞,地名,国":{POS: PROPN}, # country name + + 
"名詞,助動詞語幹,*,*":{POS: AUX}, + "名詞,数詞,*,*":{POS: NUM}, # includes Chinese numerals + + "名詞,普通名詞,サ変可能,*":{POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun + "名詞,普通名詞,サ変可能,*,NOUN":{POS: NOUN}, + "名詞,普通名詞,サ変可能,*,VERB":{POS: VERB}, + + "名詞,普通名詞,サ変形状詞可能,*":{POS: NOUN}, # ex: 下手 + "名詞,普通名詞,一般,*":{POS: NOUN}, + "名詞,普通名詞,形状詞可能,*":{POS: NOUN}, # XXX: sometimes ADJ in UDv2 + "名詞,普通名詞,形状詞可能,*,NOUN":{POS: NOUN}, + "名詞,普通名詞,形状詞可能,*,ADJ":{POS: ADJ}, + "名詞,普通名詞,助数詞可能,*":{POS: NOUN}, # counter / unit + "名詞,普通名詞,副詞可能,*":{POS: NOUN}, + + "連体詞,*,*,*":{POS: ADJ}, # XXX note この、その etc. should be DET + "連体詞,*,*,*,ADJ":{POS: ADJ}, + "連体詞,*,*,*,DET":{POS: DET}, } diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 6e00b1513..52b9bdd57 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -79,9 +79,12 @@ def fi_tokenizer(): @pytest.fixture def ja_tokenizer(): - janome = pytest.importorskip("janome") + pytest.importorskip("MeCab") return Japanese.Defaults.create_tokenizer() +@pytest.fixture +def japanese(): + return Japanese() @pytest.fixture def sv_tokenizer(): diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py new file mode 100644 index 000000000..43259fb49 --- /dev/null +++ b/spacy/tests/ja/test_tagger.py @@ -0,0 +1,10 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +def test_japanese_tagger(japanese): + doc = japanese.make_doc("このファイルには小さなテストが入っているよ") + # note these both have the same raw tag, '連体詞,*,*,*' + assert doc[0].pos_ == "DET" + assert doc[4].pos_ == "ADJ" From e3738aba0dc562cdd87133fdfd58a9741b0c08f2 Mon Sep 17 00:00:00 2001 From: Kevin Marsh Date: Tue, 15 Aug 2017 21:50:09 +0100 Subject: [PATCH 051/110] Fix broken tutorial link on website --- website/docs/usage/_data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 703a185d6..c2ce271aa 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -313,7 +313,7 @@ "author": "Clark Grubb" }, "A very (very) short primer on spacy.io": { - "url": "http://blog.milonimrod.com/2015/10/a-very-very-short-primer-on-spacyio.html", + "url": "https://web.archive.org/web/20161219095416/http://blog.milonimrod.com/2015/10/a-very-very-short-primer-on-spacyio.html", "author": "Nimrod Milo " } }, From 234a8a75917aa01c48e06ed4767d6f47cdfead22 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 21 Aug 2017 00:21:45 +0900 Subject: [PATCH 052/110] =?UTF-8?q?Change=20default=20tag=20for=20?= =?UTF-8?q?=E5=8B=95=E8=A9=9E,=E9=9D=9E=E8=87=AA=E7=AB=8B=E5=8F=AF?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Example of this is いる in these sentences: 彼はそこにいる。# should be VERB 彼は底に立っている。# should be AUX Unclear which case is more numerous - need to check a large corpus - but in keeping with the other ambiguous tags, this is mapped to the "dominant" or first part of the tag. 
-POLM --- spacy/ja/tag_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index 609739c2f..8436f07ff 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -46,7 +46,7 @@ TAG_MAP = { "代名詞,*,*,*":{POS: PRON}, "動詞,一般,*,*":{POS: VERB}, - "動詞,非自立可能,*,*":{POS: AUX}, # XXX VERB if alone, AUX otherwise + "動詞,非自立可能,*,*":{POS: VERB}, # XXX VERB if alone, AUX otherwise "動詞,非自立可能,*,*,AUX":{POS: AUX}, "動詞,非自立可能,*,*,VERB":{POS: VERB}, "副詞,*,*,*":{POS: ADV}, From c5c3f4c7d9a9c715110040d1e75e08d8a7b8dc20 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 21 Aug 2017 16:08:40 +0200 Subject: [PATCH 053/110] Use more generous .env ignore rule --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2209f5b4a..84ced41f8 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ Profile.prof .python-version __pycache__/ *.py[cod] +.env*/ .env/ .env2/ .env3/ From edc596d9a77cf0281b3641297fd5abd62a74edf2 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 21 Aug 2017 16:11:36 +0200 Subject: [PATCH 054/110] Add missing tokenizer exceptions (resolves #1281) --- spacy/en/tokenizer_exceptions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index d9aa01734..29447314a 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -276,7 +276,10 @@ for verb_data in [ {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, {ORTH: "was", LEMMA: "be"}, - {ORTH: "were", LEMMA: "be"} + {ORTH: "were", LEMMA: "be"}, + {ORTH: "have"}, + {ORTH: "has", LEMMA: "have"}, + {ORTH: "dare"} ]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() From dcff10abe98c844f2f66ff22835c9eb8ea8e7138 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 21 Aug 2017 16:11:47 +0200 Subject: [PATCH 055/110] Add regression test for #1281 --- spacy/tests/regression/test_issue1281.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 spacy/tests/regression/test_issue1281.py diff --git a/spacy/tests/regression/test_issue1281.py b/spacy/tests/regression/test_issue1281.py new file mode 100644 index 000000000..17307b1d6 --- /dev/null +++ b/spacy/tests/regression/test_issue1281.py @@ -0,0 +1,13 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', [ + "She hasn't done the housework.", + "I haven't done it before.", + "you daren't do that"]) +def test_issue1281(en_tokenizer, text): + tokens = en_tokenizer(text) + assert tokens[2].text == "n't" From c435f748d743b1ee407c02c14223679769fa52b2 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 00:01:28 +0900 Subject: [PATCH 056/110] Put Mecab import in utility function --- spacy/ja/__init__.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 5f49f0b1b..c82591f58 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -16,14 +16,21 @@ from collections import namedtuple ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) +def try_mecab_import(): + """Mecab is required for Japanese support, so check for it. 
+ + It it's not available blow up and explain how to fix it.""" + try: + import MeCab + return MeCab + except ImportError: + raise ImportError("Japanese support requires MeCab: " + "https://github.com/SamuraiT/mecab-python3") + class JapaneseTokenizer(object): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - try: - import MeCab - except ImportError: - raise ImportError("The Japanese tokenizer requires the MeCab library: " - "https://github.com/SamuraiT/mecab-python3") + MeCab = try_mecab_import() self.tokenizer = MeCab.Tagger() def __call__(self, text): @@ -70,12 +77,7 @@ def detailed_tokens(tokenizer, text): class JapaneseTagger(object): def __init__(self, vocab): - try: - import MeCab - except ImportError: - raise ImportError("The Japanese tagger requires the MeCab library: " - "https://github.com/SamuraiT/mecab-python3") - + MeCab = try_mecab_import() self.tagger = Tagger(vocab) self.tokenizer = MeCab.Tagger() From 53e17296e98ba8db1b9b99fec0a39aaa56d12e5c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 00:01:49 +0900 Subject: [PATCH 057/110] Fix pronoun handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Missed this case earlier. 連体詞 have three classes for UD purposes: - その -> DET - それ -> PRON - 同じ -> ADJ -POLM --- spacy/ja/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index c82591f58..8cd48ad84 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -51,9 +51,10 @@ def resolve_pos(token): # PoS mappings. if token.part_of_speech == '連体詞,*,*,*': - # determiner-likes get DET, otherwise ADJ if re.match('^[こそあど此其彼]の', token.surface): return token.part_of_speech + ',DET' + if re.match('^[こそあど此其彼]', token.surface): + return token.part_of_speech + ',PRON' else: return token.part_of_speech + ',ADJ' return token.part_of_speech From adfd98731655cc3f351e0042353ea850ef7d23c2 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 00:02:55 +0900 Subject: [PATCH 058/110] Update the TAG_MAP --- spacy/ja/__init__.py | 3 --- spacy/ja/tag_map.py | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 8cd48ad84..dfd0bca5b 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -87,9 +87,6 @@ class JapaneseTagger(object): # 1. get raw JP tags # 2. add features to tags as necessary for UD - # TODO: if the text has been tokenized, this info is already available - # How to set the data when tokenizing or save it for the tagger to find? - dtokens = detailed_tokens(self.tokenizer, tokens.text) rawtags = list(map(resolve_pos, dtokens)) self.tagger.tag_from_strings(tokens, rawtags) diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index 8436f07ff..191865ed2 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -81,7 +81,8 @@ TAG_MAP = { "名詞,普通名詞,助数詞可能,*":{POS: NOUN}, # counter / unit "名詞,普通名詞,副詞可能,*":{POS: NOUN}, - "連体詞,*,*,*":{POS: ADJ}, # XXX note この、その etc. should be DET + "連体詞,*,*,*":{POS: ADJ}, # XXX this has exceptions based on literal token "連体詞,*,*,*,ADJ":{POS: ADJ}, + "連体詞,*,*,*,PRON":{POS: PRON}, "連体詞,*,*,*,DET":{POS: DET}, } From bcf2b9b4f5e12951394bbc2e77daf5a1763ec9e5 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 00:03:11 +0900 Subject: [PATCH 059/110] Update tagger & tokenizer tests Tagger is now parametrized and has two sentences with more tag coverage. 
The tokenizer tests are updated to reflect differences in tokenization between IPAdic and Unidic. -POLM --- spacy/tests/ja/test_tagger.py | 33 +++++++++++++++++++++++++++----- spacy/tests/ja/test_tokenizer.py | 4 ++-- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py index 43259fb49..629cc795f 100644 --- a/spacy/tests/ja/test_tagger.py +++ b/spacy/tests/ja/test_tagger.py @@ -3,8 +3,31 @@ from __future__ import unicode_literals import pytest -def test_japanese_tagger(japanese): - doc = japanese.make_doc("このファイルには小さなテストが入っているよ") - # note these both have the same raw tag, '連体詞,*,*,*' - assert doc[0].pos_ == "DET" - assert doc[4].pos_ == "ADJ" +TAGGER_TESTS = [ + ('あれならそこにあるよ', + (('代名詞,*,*,*', 'PRON'), + ('助動詞,*,*,*', 'AUX'), + ('代名詞,*,*,*', 'PRON'), + ('助詞,格助詞,*,*', 'ADP'), + ('動詞,非自立可能,*,*', 'VERB'), + ('助詞,終助詞,*,*', 'PART'))), + ('このファイルには小さなテストが入っているよ', + (('連体詞,*,*,*,DET', 'DET'), + ('名詞,普通名詞,サ変可能,*', 'NOUN'), + ('助詞,格助詞,*,*', 'ADP'), + ('助詞,係助詞,*,*', 'ADP'), + ('連体詞,*,*,*,ADJ', 'ADJ'), + ('名詞,普通名詞,サ変可能,*', 'NOUN'), + ('助詞,格助詞,*,*', 'ADP'), + ('動詞,一般,*,*', 'VERB'), + ('助詞,接続助詞,*,*', 'SCONJ'), + ('動詞,非自立可能,*,*', 'VERB'), + ('助詞,終助詞,*,*', 'PART'))) +] + +@pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS) +def test_japanese_tagger(japanese, text, expected_tags): + tokens = japanese.make_doc(text) + assert len(tokens) == len(expected_tags) + for token, res in zip(tokens, expected_tags): + assert token.tag_ == res[0] and token.pos_ == res[1] diff --git a/spacy/tests/ja/test_tokenizer.py b/spacy/tests/ja/test_tokenizer.py index 58700b353..17411aee2 100644 --- a/spacy/tests/ja/test_tokenizer.py +++ b/spacy/tests/ja/test_tokenizer.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import pytest TOKENIZER_TESTS = [ - ("日本語だよ", ['日本語', 'だ', 'よ']), + ("日本語だよ", ['日本', '語', 'だ', 'よ']), ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']), ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']), - ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']), + ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']), ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']) ] From 95050201ce095e2328be383beec3025a5e64fb0a Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 21:30:59 +0900 Subject: [PATCH 060/110] Add importorskip for Japanese fixture --- spacy/tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 52b9bdd57..5fad6e429 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -84,6 +84,7 @@ def ja_tokenizer(): @pytest.fixture def japanese(): + pytest.importorskip("MeCab") return Japanese() @pytest.fixture From 884ba168a88699bedecf55888b670cbf2040a539 Mon Sep 17 00:00:00 2001 From: Jeffrey Gerard Date: Wed, 23 Aug 2017 21:18:53 -0700 Subject: [PATCH 061/110] Capture more noun chunks --- spacy/syntax/iterators.pyx | 2 +- spacy/tests/parser/test_noun_chunks.py | 30 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index 0fe724622..14dba5f9b 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -9,7 +9,7 @@ def english_noun_chunks(obj): Detect base noun phrases from a dependency parse. Works on both Doc and Span. 
""" - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'dative', 'appos', 'attr', 'ROOT'] doc = obj.doc # Ensure works on both Doc and Span. np_deps = [doc.vocab.strings[label] for label in labels] diff --git a/spacy/tests/parser/test_noun_chunks.py b/spacy/tests/parser/test_noun_chunks.py index 5e8c7659a..ddebca8b8 100644 --- a/spacy/tests/parser/test_noun_chunks.py +++ b/spacy/tests/parser/test_noun_chunks.py @@ -47,6 +47,36 @@ def test_parser_noun_chunks_pp_chunks(en_tokenizer): assert chunks[1].text_with_ws == "another phrase " +def test_parser_noun_chunks_appositional_modifiers(en_tokenizer): + text = "Sam, my brother, arrived to the house." + heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4] + tags = ['NNP', ',', 'PRP$', 'NN', ',', 'VBD', 'IN', 'DT', 'NN', '.'] + deps = ['nsubj', 'punct', 'poss', 'appos', 'punct', 'ROOT', 'prep', 'det', 'pobj', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) + chunks = list(doc.noun_chunks) + assert len(chunks) == 3 + assert chunks[0].text_with_ws == "Sam " + assert chunks[1].text_with_ws == "my brother " + assert chunks[2].text_with_ws == "the house " + + +def test_parser_noun_chunks_dative(en_tokenizer): + text = "She gave Bob a raise." + heads = [1, 0, -1, 1, -3, -4] + tags = ['PRP', 'VBD', 'NNP', 'DT', 'NN', '.'] + deps = ['nsubj', 'ROOT', 'dative', 'det', 'dobj', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) + chunks = list(doc.noun_chunks) + assert len(chunks) == 3 + assert chunks[0].text_with_ws == "She " + assert chunks[1].text_with_ws == "Bob " + assert chunks[2].text_with_ws == "a raise " + + def test_parser_noun_chunks_standard_de(de_tokenizer): text = "Eine Tasse steht auf dem Tisch." heads = [1, 1, 0, -1, 1, -2, -4] From 8b3e1f7b5b2d29ca3b70e5681daa095574b694be Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 29 Aug 2017 23:58:42 +0900 Subject: [PATCH 062/110] Handle out-of-vocab words Wasn't handling words out of the tokenizer dictionary vocabulary properly. This adds a fix and test for that. 
-POLM --- spacy/ja/__init__.py | 10 +++++++--- spacy/tests/ja/test_tagger.py | 7 ++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index dfd0bca5b..2f85406c0 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -66,11 +66,15 @@ def detailed_tokens(tokenizer, text): node = node.next # first node is beginning of sentence and empty, skip it words = [] while node.posid != 0: + surface = node.surface + base = surface parts = node.feature.split(',') pos = ','.join(parts[0:4]) - reading = parts[6] - base = parts[7] - surface = parts[8] + + if len(parts) > 6: + # this information is only available for words in the tokenizer dictionary + reading = parts[6] + base = parts[7] words.append( ShortUnitWord(surface, base, pos) ) node = node.next diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py index 629cc795f..85f653836 100644 --- a/spacy/tests/ja/test_tagger.py +++ b/spacy/tests/ja/test_tagger.py @@ -22,7 +22,12 @@ TAGGER_TESTS = [ ('動詞,一般,*,*', 'VERB'), ('助詞,接続助詞,*,*', 'SCONJ'), ('動詞,非自立可能,*,*', 'VERB'), - ('助詞,終助詞,*,*', 'PART'))) + ('助詞,終助詞,*,*', 'PART'))), + ('プププランドに行きたい', + (('名詞,普通名詞,一般,*', 'NOUN'), + ('助詞,格助詞,*,*', 'ADP'), + ('動詞,非自立可能,*,*', 'VERB'), + ('助動詞,*,*,*', 'AUX'))) ] @pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS) From a6d9fb5bb65066887e5a7e5d44b078e722b2b002 Mon Sep 17 00:00:00 2001 From: Vimos Tan Date: Wed, 30 Aug 2017 14:49:14 +0800 Subject: [PATCH 063/110] fix issue #1292 --- .../tokenizer/test_customized_tokenizer.py | 46 +++++++++++++++++++ spacy/tokenizer.pyx | 3 +- 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/tokenizer/test_customized_tokenizer.py diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py new file mode 100644 index 000000000..97a7db64c --- /dev/null +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ... import load +from ...tokenizer import Tokenizer +from ... import util + +import pytest + + +def test_customized_tokenizer_handles_infixes(): + def custom_tokenizer(nlp_model): + prefix_re = util.compile_prefix_regex(nlp_model.Defaults.prefixes) + suffix_re = util.compile_suffix_regex(nlp_model.Defaults.suffixes) + custom_infixes = ['\.\.\.+', + '(?<=[0-9])-(?=[0-9])', + # '(?<=[0-9]+),(?=[0-9]+)', + '[0-9]+(,[0-9]+)+', + u'[\[\]!&:,()\*—–\/-]'] + + infix_re = util.compile_infix_regex(custom_infixes) + + # infix_re = re.compile(ur'[\[\]!&:,()]') + + tokenizer = Tokenizer(nlp_model.vocab, + nlp_model.Defaults.tokenizer_exceptions, + prefix_re.search, + suffix_re.search, + infix_re.finditer, + token_match=None) + return lambda text: tokenizer(text) + + nlp = load('en', create_make_doc=custom_tokenizer) + + sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion." + context = [word.text for word in nlp(sentence)] + assert context == [u'The', u'8', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', + u'for', + u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] + + # the trailing '-' may cause Assertion Error + sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion." 
+ context = [word.text for word in nlp(sentence)] + assert context == [u'The', u'8', u'-', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', + u'for', + u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 276f0ef20..799e4bdaa 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -312,7 +312,8 @@ cdef class Tokenizer: start = infix_end span = string[start:] - tokens.push_back(self.vocab.get(tokens.mem, span), False) + if span: + tokens.push_back(self.vocab.get(tokens.mem, span), False) cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): lexeme = deref(it) From 9bffcaa73df60794c63f428f5f83f06bd5a271e4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 1 Sep 2017 21:16:56 +0200 Subject: [PATCH 064/110] Update test to make it slightly more direct The `nlp` container should be unnecessary here. If so, we can test the tokenizer class just a little more directly. --- .../tokenizer/test_customized_tokenizer.py | 46 ++++++++----------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py index 97a7db64c..695f8c649 100644 --- a/spacy/tests/tokenizer/test_customized_tokenizer.py +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -1,46 +1,40 @@ # coding: utf-8 from __future__ import unicode_literals -from ... import load +from ...lang.en import English from ...tokenizer import Tokenizer from ... import util import pytest +@pytest.fixture +def tokenizer(en_vocab): + prefix_re = util.compile_prefix_regex(nlp_model.Defaults.prefixes) + suffix_re = util.compile_suffix_regex(nlp_model.Defaults.suffixes) + custom_infixes = ['\.\.\.+', + '(?<=[0-9])-(?=[0-9])', + # '(?<=[0-9]+),(?=[0-9]+)', + '[0-9]+(,[0-9]+)+', + u'[\[\]!&:,()\*—–\/-]'] -def test_customized_tokenizer_handles_infixes(): - def custom_tokenizer(nlp_model): - prefix_re = util.compile_prefix_regex(nlp_model.Defaults.prefixes) - suffix_re = util.compile_suffix_regex(nlp_model.Defaults.suffixes) - custom_infixes = ['\.\.\.+', - '(?<=[0-9])-(?=[0-9])', - # '(?<=[0-9]+),(?=[0-9]+)', - '[0-9]+(,[0-9]+)+', - u'[\[\]!&:,()\*—–\/-]'] - - infix_re = util.compile_infix_regex(custom_infixes) - - # infix_re = re.compile(ur'[\[\]!&:,()]') - - tokenizer = Tokenizer(nlp_model.vocab, - nlp_model.Defaults.tokenizer_exceptions, - prefix_re.search, - suffix_re.search, - infix_re.finditer, - token_match=None) - return lambda text: tokenizer(text) - - nlp = load('en', create_make_doc=custom_tokenizer) + infix_re = util.compile_infix_regex(custom_infixes) + return Tokenizer(en_vocab, + English.Defaults.tokenizer_exceptions, + prefix_re.search, + suffix_re.search, + infix_re.finditer, + token_match=None) +def test_customized_tokenizer_handles_infixes(tokenizer): sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion." - context = [word.text for word in nlp(sentence)] + context = [word.text for word in tokenizer(sentence)] assert context == [u'The', u'8', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', u'for', u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] # the trailing '-' may cause Assertion Error sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion." 
- context = [word.text for word in nlp(sentence)] + context = [word.text for word in tokenizer(sentence)] assert context == [u'The', u'8', u'-', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', u'for', u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] From d61c117081a57f7788e7e709abfd9adcd6e39df8 Mon Sep 17 00:00:00 2001 From: Eric Zhao Date: Sun, 3 Sep 2017 12:16:59 -0700 Subject: [PATCH 065/110] Lowest common ancestor matrix for spans and docs Added functionality for spans and docs to get lowest common ancestor matrix by simply calling: doc.get_lca_matrix() or doc[:3].get_lca_matrix(). Corresponding unit tests were also added under spacy/tests/doc and spacy/tests/spans. Designed to address: https://github.com/explosion/spaCy/issues/969. --- spacy/tests/doc/test_doc_api.py | 7 +++++ spacy/tests/spans/test_span.py | 11 +++++++ spacy/tokens/doc.pyx | 43 +++++++++++++++++++++++++++ spacy/tokens/span.pyx | 52 +++++++++++++++++++++++++++++++++ 4 files changed, 113 insertions(+) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 1bc534ecd..d1a6316d5 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -216,6 +216,13 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors): doc = en_tokenizer(text) assert doc.has_vector +def test_lowest_common_ancestor(en_tokenizer): + tokens = en_tokenizer('the lazy dog slept') + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0]) + lca = doc.get_lca_matrix() + assert(lca[1, 1] == 1) + assert(lca[0, 1] == 2) + assert(lca[1, 2] == 2) def test_parse_tree(en_tokenizer): """Tests doc.print_tree() method.""" diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index d22fa52ae..29aefe5c7 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -54,6 +54,17 @@ def test_spans_span_sent(doc): assert doc[6:7].sent.root.left_edge.text == 'This' +def test_spans_lca_matrix(en_tokenizer): + """Test span's lca matrix generation""" + tokens = en_tokenizer('the lazy dog slept') + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0]) + lca = doc[:2].get_lca_matrix() + assert(lca[0, 0] == 0) + assert(lca[0, 1] == -1) + assert(lca[1, 0] == -1) + assert(lca[1, 1] == 1) + + def test_spans_default_sentiment(en_tokenizer): """Test span.sentiment property's default averaging behaviour""" text = "good stuff bad stuff" diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ca5a3d696..aa888382e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -614,6 +614,49 @@ cdef class Doc: self.is_tagged = bool(TAG in attrs or POS in attrs) return self + + def get_lca_matrix(self): + ''' + Calculates the lowest common ancestor matrix + for a given Spacy doc. + Returns LCA matrix containing the integer index + of the ancestor, or -1 if no common ancestor is + found (ex if span excludes a necessary ancestor). + Apologies about the recursion, but the + impact on performance is negligible given + the natural limitations on the depth of a typical human sentence. 
+ ''' + + def __pairwise_lca(token_j, token_k, lca_matrix): + if lca_matrix[token_j.i][token_k.i] != -2: + return lca_matrix[token_j.i][token_k.i] + elif token_j == token_k: + lca_index = token_j.i + elif token_k.head == token_j: + lca_index = token_j.i + elif token_j.head == token_k: + lca_index = token_k.i + elif (token_j.head == token_j) and (token_k.head == token_k): + lca_index = -1 + else: + lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix) + lca_matrix[token_j.i][token_k.i] = lca_index + lca_matrix[token_k.i][token_j.i] = lca_index + + return lca_index + + lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) + lca_matrix.fill(-2) + for j in range(len(self)): + token_j = self[j] + for k in range(len(self)): + token_k = self[k] + lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix) + lca_matrix[k][j] = lca_matrix[j][k] + + return lca_matrix + + def to_bytes(self): """ Serialize, producing a byte string. diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index d8890addc..ae28f698a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -130,6 +130,58 @@ cdef class Span: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + def get_lca_matrix(self): + ''' + Calculates the lowest common ancestor matrix + for a given Spacy span. + Returns LCA matrix containing the integer index + of the ancestor, or -1 if no common ancestor is + found (ex if span excludes a necessary ancestor). + Apologies about the recursion, but the + impact on performance is negligible given + the natural limitations on the depth of a typical human sentence. + ''' + + def __pairwise_lca(token_j, token_k, lca_matrix, margins): + offset = margins[0] + token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k + token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j + token_j_i = token_j.i - offset + token_k_i = token_k.i - offset + + if lca_matrix[token_j_i][token_k_i] != -2: + return lca_matrix[token_j_i][token_k_i] + elif token_j == token_k: + lca_index = token_j_i + elif token_k_head == token_j: + lca_index = token_j_i + elif token_j_head == token_k: + lca_index = token_k_i + elif (token_j_head == token_j) and (token_k_head == token_k): + lca_index = -1 + else: + lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins) + + lca_matrix[token_j_i][token_k_i] = lca_index + lca_matrix[token_k_i][token_j_i] = lca_index + + return lca_index + + lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) + lca_matrix.fill(-2) + margins = [self.start, self.end] + + for j in range(len(self)): + token_j = self[j] + for k in range(len(self)): + token_k = self[k] + lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins) + lca_matrix[k][j] = lca_matrix[j][k] + + return lca_matrix + + + cpdef int _recalculate_indices(self) except -1: if self.end > self.doc.length \ or self.doc.c[self.start].idx != self.start_char \ From e8a26ebfabec51327b2948fba95d6fa87f77eaa5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 15:43:52 +0200 Subject: [PATCH 066/110] Add efficiency note to new get_lca_matrix() method --- spacy/tokens/doc.pyx | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index aa888382e..aca35a73f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -626,7 +626,14 @@ cdef class Doc: impact on performance is negligible given the natural 
limitations on the depth of a typical human sentence. ''' - + # Efficiency notes: + # + # We can easily improve the performance here by iterating in Cython. + # To loop over the tokens in Cython, the easiest way is: + # for token in doc.c[:doc.c.length]: + # head = token + token.head + # Both token and head will be TokenC* here. The token.head attribute + # is an integer offset. def __pairwise_lca(token_j, token_k, lca_matrix): if lca_matrix[token_j.i][token_k.i] != -2: return lca_matrix[token_j.i][token_k.i] @@ -649,7 +656,7 @@ cdef class Doc: lca_matrix.fill(-2) for j in range(len(self)): token_j = self[j] - for k in range(len(self)): + for k in range(j, len(self)): token_k = self[k] lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix) lca_matrix[k][j] = lca_matrix[j][k] From c68f188eb035ed67e2df905dd5e483f0261a8ace Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 18:59:36 +0200 Subject: [PATCH 067/110] Fix error on test --- spacy/tests/tokenizer/test_customized_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py index 695f8c649..19909ceba 100644 --- a/spacy/tests/tokenizer/test_customized_tokenizer.py +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from ...lang.en import English +from ...en import English from ...tokenizer import Tokenizer from ... import util From 45029a550e128e887fe1a6d826c04923991d98e2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 20:13:13 +0200 Subject: [PATCH 068/110] Fix customized-tokenizer tests --- spacy/tests/tokenizer/test_customized_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py index 19909ceba..855f3386c 100644 --- a/spacy/tests/tokenizer/test_customized_tokenizer.py +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -9,8 +9,8 @@ import pytest @pytest.fixture def tokenizer(en_vocab): - prefix_re = util.compile_prefix_regex(nlp_model.Defaults.prefixes) - suffix_re = util.compile_suffix_regex(nlp_model.Defaults.suffixes) + prefix_re = util.compile_prefix_regex(English.Defaults.prefixes) + suffix_re = util.compile_suffix_regex(English.Defaults.suffixes) custom_infixes = ['\.\.\.+', '(?<=[0-9])-(?=[0-9])', # '(?<=[0-9]+),(?=[0-9]+)', From 7692b8c071af51165c732474978b032ca85f262f Mon Sep 17 00:00:00 2001 From: Yu-chun Huang Date: Tue, 12 Sep 2017 16:23:47 +0800 Subject: [PATCH 069/110] Update __init__.py Set the "cut_all" parameter to False, or jieba will return ALL POSSIBLE word segmentations. --- spacy/zh/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/zh/__init__.py b/spacy/zh/__init__.py index 0f407dec6..bde0054b5 100644 --- a/spacy/zh/__init__.py +++ b/spacy/zh/__init__.py @@ -7,6 +7,6 @@ class Chinese(Language): def make_doc(self, text): import jieba - words = list(jieba.cut(text, cut_all=True)) + words = list(jieba.cut(text, cut_all=False)) words=[x for x in words if x] return Doc(self.vocab, words=words, spaces=[False]*len(words)) From 1f1f35dcd07d419a2aca449c0ef738e098e37b68 Mon Sep 17 00:00:00 2001 From: Yu-chun Huang Date: Tue, 19 Sep 2017 16:57:24 +0800 Subject: [PATCH 070/110] Add Chinese punctuation Add Chinese punctuation. 
--- spacy/language_data/punctuation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index f23b15bbc..fe636fa4b 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -19,11 +19,13 @@ _CURRENCY = r""" _QUOTES = r""" ' '' " ” “ `` ` ‘ ´ ‚ , „ » « +「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉 """ _PUNCT = r""" … , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & +。? ! , 、 ; : ~ """ From 188b439b25dbe020977761cc719efaf452e79423 Mon Sep 17 00:00:00 2001 From: Yu-chun Huang Date: Tue, 19 Sep 2017 16:58:42 +0800 Subject: [PATCH 071/110] Add Chinese punctuation Add Chinese punctuation. --- spacy/language_data/punctuation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index fe636fa4b..6229eff21 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -25,7 +25,7 @@ _QUOTES = r""" _PUNCT = r""" … , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & -。? ! , 、 ; : ~ +。 ? ! , 、 ; : ~ """ From 978b24ccd44a80f9ea2f8ae781e9b3a2164f68c4 Mon Sep 17 00:00:00 2001 From: Yam Date: Wed, 20 Sep 2017 23:02:22 +0800 Subject: [PATCH 072/110] Update punctuation.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Chinese, `~` and `——` is hyphens, `·` is intermittent symbol --- spacy/language_data/punctuation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index 6229eff21..58ec73f2d 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -25,12 +25,12 @@ _QUOTES = r""" _PUNCT = r""" … , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & -。 ? ! , 、 ; : ~ +。 ? ! , 、 ; : ~ · """ _HYPHENS = r""" -- – — -- --- +- – — -- --- —— ~ """ From 44291f6697e3707c8730153c78cc547fc2e8f9e4 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 20 Sep 2017 23:26:34 +0700 Subject: [PATCH 073/110] add thai --- spacy/__init__.py | 5 +- spacy/th/__init__.py | 30 ++++++++++++ spacy/th/language_data.py | 25 ++++++++++ spacy/th/stop_words.py | 62 ++++++++++++++++++++++++ spacy/th/tag_map.py | 81 ++++++++++++++++++++++++++++++++ spacy/th/tokenizer_exceptions.py | 80 +++++++++++++++++++++++++++++++ 6 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 spacy/th/__init__.py create mode 100644 spacy/th/language_data.py create mode 100644 spacy/th/stop_words.py create mode 100644 spacy/th/tag_map.py create mode 100644 spacy/th/tokenizer_exceptions.py diff --git a/spacy/__init__.py b/spacy/__init__.py index 3afb38cfb..f0d5ea0fc 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -7,12 +7,13 @@ from .cli.info import info from .glossary import explain from .about import __version__ -from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja +from . 
import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja,th _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, - fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese) + fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese, + th.Thai) for _lang in _languages: diff --git a/spacy/th/__init__.py b/spacy/th/__init__.py new file mode 100644 index 000000000..0b6f8cf76 --- /dev/null +++ b/spacy/th/__init__.py @@ -0,0 +1,30 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .language_data import * +from ..language import Language, BaseDefaults +from ..attrs import LANG +from ..tokenizer import Tokenizer +from ..tokens import Doc +class ThaiDefaults(BaseDefaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'th' + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + tag_map = TAG_MAP + stop_words = set(STOP_WORDS) + + +class Thai(Language): + lang = 'th' + Defaults = ThaiDefaults + def make_doc(self, text): + try: + from pythainlp.tokenize import word_tokenize + except ImportError: + raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " + "https://github.com/wannaphongcom/pythainlp/") + words = [x for x in list(word_tokenize(text,"newmm"))] + return Doc(self.vocab, words=words, spaces=[False]*len(words)) + +__all__ = ['Thai'] \ No newline at end of file diff --git a/spacy/th/language_data.py b/spacy/th/language_data.py new file mode 100644 index 000000000..03800ba19 --- /dev/null +++ b/spacy/th/language_data.py @@ -0,0 +1,25 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# import base language data +from .. 
import language_data as base + + +# import util functions +from ..language_data import update_exc, strings_to_exc + + +# import language-specific data from files +#from .tag_map import TAG_MAP +from .tag_map import TAG_MAP +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS + + +TAG_MAP = dict(TAG_MAP) +STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) + +# export __all__ = ["TAG_MAP", "STOP_WORDS"] +__all__ = ["TAG_MAP", "STOP_WORDS","TOKENIZER_EXCEPTIONS"] \ No newline at end of file diff --git a/spacy/th/stop_words.py b/spacy/th/stop_words.py new file mode 100644 index 000000000..e13dec984 --- /dev/null +++ b/spacy/th/stop_words.py @@ -0,0 +1,62 @@ +# encoding: utf8 +from __future__ import unicode_literals + +# data from https://github.com/wannaphongcom/pythainlp/blob/dev/pythainlp/corpus/stopwords-th.txt +# stop words as whitespace-separated list +STOP_WORDS = set(""" +นี้ นํา นั้น นัก นอกจาก ทุก ที่สุด ที่ ทําให้ ทํา ทาง ทั้งนี้ ดัง ซึ่ง ช่วง จาก จัด จะ คือ ความ ครั้ง คง ขึ้น ของ +ขอ รับ ระหว่าง รวม ยัง มี มาก มา พร้อม พบ ผ่าน ผล บาง น่า เปิดเผย เปิด เนื่องจาก เดียวกัน เดียว เช่น เฉพาะ เข้า ถ้า +ถูก ถึง ต้อง ต่างๆ ต่าง ต่อ ตาม ตั้งแต่ ตั้ง ด้าน ด้วย อีก อาจ ออก อย่าง อะไร อยู่ อยาก หาก หลาย หลังจาก แต่ เอง เห็น +เลย เริ่ม เรา เมื่อ เพื่อ เพราะ เป็นการ เป็น หลัง หรือ หนึ่ง ส่วน ส่ง สุด สําหรับ ว่า ลง ร่วม ราย ขณะ ก่อน ก็ การ กับ กัน +กว่า กล่าว จึง ไว้ ไป ได้ ให้ ใน โดย แห่ง แล้ว และ แรก แบบ ๆ ทั้ง วัน เขา เคย ไม่ อยาก เกิน เกินๆ เกี่ยวกัน เกี่ยวกับ +เกี่ยวข้อง เกี่ยวเนื่อง เกี่ยวๆ เกือบ เกือบจะ เกือบๆ แก แก่ แก้ไข ใกล้ ใกล้ๆ ไกล ไกลๆ ขณะเดียวกัน ขณะใด ขณะใดๆ ขณะที่ ขณะนั้น ขณะนี้ ขณะหนึ่ง ขวาง +ขวางๆ ขั้น ใคร ใคร่ ใคร่จะ ใครๆ ง่าย ง่ายๆ ไง จง จด จน จนกระทั่ง จนกว่า จนขณะนี้ จนตลอด จนถึง จนทั่ว จนบัดนี้ จนเมื่อ จนแม้ จนแม้น +จรด จรดกับ จริง จริงจัง จริงๆ จริงๆจังๆ จวน จวนจะ จวนเจียน จวบ ซึ่งก็ ซึ่งก็คือ ซึ่งกัน ซึ่งกันและกัน ซึ่งได้แก่ ซึ่งๆ ณ ด้วย ด้วยกัน ด้วยเช่นกัน ด้วยที่ ด้วยประการฉะนี้ +ด้วยเพราะ ด้วยว่า ด้วยเหตุที่ ด้วยเหตุนั้น ด้วยเหตุนี้ ด้วยเหตุเพราะ ด้วยเหตุว่า ด้วยเหมือนกัน ดั่ง ดังกล่าว ดังกับ ดั่งกับ ดังกับว่า ดั่งกับว่า ดังเก่า +ดั่งเก่า ดังเคย ใดๆ ได้ ได้แก่ ได้แต่ ได้ที่ ได้มา ได้รับ ตน ตนเอง ตนฯ ตรง ตรงๆ ตลอด ตลอดกาล ตลอดกาลนาน ตลอดจน ตลอดถึง ตลอดทั้ง +ตลอดทั่ว ตลอดทั่วถึง ตลอดทั่วทั้ง ตลอดปี ตลอดไป ตลอดมา ตลอดระยะเวลา ตลอดวัน ตลอดเวลา ตลอดศก ต่อ ต่อกัน ถึงแก่ ถึงจะ ถึงบัดนั้น ถึงบัดนี้ +ถึงเมื่อ ถึงเมื่อใด ถึงเมื่อไร ถึงแม้ ถึงแม้จะ ถึงแม้ว่า ถึงอย่างไร ถือ ถือว่า ถูกต้อง ถูกๆ เถอะ เถิด ทรง ทว่า ทั้งคน ทั้งตัว ทั้งที ทั้งที่ ทั้งนั้น ทั้งนั้นด้วย ทั้งนั้นเพราะ +นอก นอกจากที่ นอกจากนั้น นอกจากนี้ นอกจากว่า นอกนั้น นอกเหนือ นอกเหนือจาก น้อย น้อยกว่า น้อยๆ นะ น่ะ นักๆ นั่น นั่นไง นั่นเป็น นั่นแหละ +นั่นเอง นั้นๆ นับ นับจากนั้น นับจากนี้ นับตั้งแต่ นับแต่ นับแต่ที่ นับแต่นั้น เป็นต้น เป็นต้นไป เป็นต้นมา เป็นแต่ เป็นแต่เพียง เป็นที เป็นที่ เป็นที่สุด เป็นเพราะ +เป็นเพราะว่า เป็นเพียง เป็นเพียงว่า เป็นเพื่อ เป็นอัน เป็นอันมาก เป็นอันว่า เป็นอันๆ เป็นอาทิ เป็นๆ เปลี่ยน เปลี่ยนแปลง เปิด เปิดเผย ไป่ ผ่าน ผ่านๆ +ผิด ผิดๆ ผู้ เพียงเพื่อ เพียงไร เพียงไหน เพื่อที่ เพื่อที่จะ เพื่อว่า เพื่อให้ ภาค ภาคฯ ภาย ภายใต้ ภายนอก ภายใน ภายภาค ภายภาคหน้า ภายหน้า ภายหลัง +มอง มองว่า มัก มักจะ มัน มันๆ มั้ย มั้ยนะ มั้ยนั่น มั้ยเนี่ย มั้ยล่ะ ยืนนาน ยืนยง ยืนยัน ยืนยาว เยอะ เยอะแยะ เยอะๆ แยะ แยะๆ รวด รวดเร็ว ร่วม รวมกัน ร่วมกัน +รวมด้วย ร่วมด้วย รวมถึง รวมทั้ง ร่วมมือ รวมๆ ระยะ ระยะๆ ระหว่าง รับรอง รึ รึว่า รือ รือว่า สิ้นกาลนาน สืบเนื่อง สุดๆ สู่ สูง สูงกว่า สูงส่ง สูงสุด สูงๆ เสมือนกับ +เสมือนว่า เสร็จ เสร็จกัน เสร็จแล้ว 
เสร็จสมบูรณ์ เสร็จสิ้น เสีย เสียก่อน เสียจน เสียจนกระทั่ง เสียจนถึง เสียด้วย เสียนั่น เสียนั่นเอง เสียนี่ เสียนี่กระไร เสียยิ่ง +เสียยิ่งนัก เสียแล้ว ใหญ่ๆ ให้ดี ให้แด่ ให้ไป ใหม่ ให้มา ใหม่ๆ ไหน ไหนๆ อดีต อนึ่ง อย่าง อย่างเช่น อย่างดี อย่างเดียว อย่างใด อย่างที่ อย่างน้อย อย่างนั้น +อย่างนี้ อย่างโน้น ก็คือ ก็แค่ ก็จะ ก็ดี ก็ได้ ก็ต่อเมื่อ ก็ตาม ก็ตามแต่ ก็ตามที ก็แล้วแต่ กระทั่ง กระทำ กระนั้น กระผม กลับ กล่าวคือ กลุ่ม กลุ่มก้อน +กลุ่มๆ กว้าง กว้างขวาง กว้างๆ ก่อนหน้า ก่อนหน้านี้ ก่อนๆ กันดีกว่า กันดีไหม กันเถอะ กันนะ กันและกัน กันไหม กันเอง กำลัง กำลังจะ กำหนด กู เก็บ +เกิด เกี่ยวข้อง แก่ แก้ไข ใกล้ ใกล้ๆ ข้า ข้าง ข้างเคียง ข้างต้น ข้างบน ข้างล่าง ข้างๆ ขาด ข้าพเจ้า ข้าฯ เข้าใจ เขียน คงจะ คงอยู่ ครบ ครบครัน ครบถ้วน +ครั้งกระนั้น ครั้งก่อน ครั้งครา ครั้งคราว ครั้งใด ครั้งที่ ครั้งนั้น ครั้งนี้ ครั้งละ ครั้งหนึ่ง ครั้งหลัง ครั้งหลังสุด ครั้งไหน ครั้งๆ ครัน ครับ ครา คราใด คราที่ ครานั้น ครานี้ คราหนึ่ง +คราไหน คราว คราวก่อน คราวใด คราวที่ คราวนั้น คราวนี้ คราวโน้น คราวละ คราวหน้า คราวหนึ่ง คราวหลัง คราวไหน คราวๆ คล้าย คล้ายกัน คล้ายกันกับ +คล้ายกับ คล้ายกับว่า คล้ายว่า ควร ค่อน ค่อนข้าง ค่อนข้างจะ ค่อยไปทาง ค่อนมาทาง ค่อย ค่อยๆ คะ ค่ะ คำ คิด คิดว่า คุณ คุณๆ +เคยๆ แค่ แค่จะ แค่นั้น แค่นี้ แค่เพียง แค่ว่า แค่ไหน ใคร่ ใคร่จะ ง่าย ง่ายๆ จนกว่า จนแม้ จนแม้น จังๆ จวบกับ จวบจน จ้ะ จ๊ะ จะได้ จัง จัดการ จัดงาน จัดแจง +จัดตั้ง จัดทำ จัดหา จัดให้ จับ จ้า จ๋า จากนั้น จากนี้ จากนี้ไป จำ จำเป็น จำพวก จึงจะ จึงเป็น จู่ๆ ฉะนั้น ฉะนี้ ฉัน เฉกเช่น เฉย เฉยๆ ไฉน ช่วงก่อน +ช่วงต่อไป ช่วงถัดไป ช่วงท้าย ช่วงที่ ช่วงนั้น ช่วงนี้ ช่วงระหว่าง ช่วงแรก ช่วงหน้า ช่วงหลัง ช่วงๆ ช่วย ช้า ช้านาน ชาว ช้าๆ เช่นก่อน เช่นกัน เช่นเคย +เช่นดัง เช่นดังก่อน เช่นดังเก่า เช่นดังที่ เช่นดังว่า เช่นเดียวกัน เช่นเดียวกับ เช่นใด เช่นที่ เช่นที่เคย เช่นที่ว่า เช่นนั้น เช่นนั้นเอง เช่นนี้ เช่นเมื่อ เช่นไร เชื่อ +เชื่อถือ เชื่อมั่น เชื่อว่า ใช่ ใช่ไหม ใช้ ซะ ซะก่อน ซะจน ซะจนกระทั่ง ซะจนถึง ซึ่งได้แก่ ด้วยกัน ด้วยเช่นกัน ด้วยที่ ด้วยเพราะ ด้วยว่า ด้วยเหตุที่ ด้วยเหตุนั้น +ด้วยเหตุนี้ ด้วยเหตุเพราะ ด้วยเหตุว่า ด้วยเหมือนกัน ดังกล่าว ดังกับว่า ดั่งกับว่า ดังเก่า ดั่งเก่า ดั่งเคย ต่างก็ ต่างหาก ตามด้วย ตามแต่ ตามที่ +ตามๆ เต็มไปด้วย เต็มไปหมด เต็มๆ แต่ก็ แต่ก่อน แต่จะ แต่เดิม แต่ต้อง แต่ถ้า แต่ทว่า แต่ที่ แต่นั้น แต่เพียง แต่เมื่อ แต่ไร แต่ละ แต่ว่า แต่ไหน แต่อย่างใด โต +โตๆ ใต้ ถ้าจะ ถ้าหาก ถึงแก่ ถึงแม้ ถึงแม้จะ ถึงแม้ว่า ถึงอย่างไร ถือว่า ถูกต้อง ทว่า ทั้งนั้นด้วย ทั้งปวง ทั้งเป็น ทั้งมวล ทั้งสิ้น ทั้งหมด ทั้งหลาย ทั้งๆ ทัน +ทันใดนั้น ทันที ทันทีทันใด ทั่ว ทำไม ทำไร ทำให้ ทำๆ ที ที่จริง ที่ซึ่ง ทีเดียว ทีใด ที่ใด ที่ได้ ทีเถอะ ที่แท้ ที่แท้จริง ที่นั้น ที่นี้ ทีไร ทีละ ที่ละ +ที่แล้ว ที่ว่า ที่แห่งนั้น ที่ไหน ทีๆ ที่ๆ ทุกคน ทุกครั้ง ทุกครา ทุกคราว ทุกชิ้น ทุกตัว ทุกทาง ทุกที ทุกที่ ทุกเมื่อ ทุกวัน ทุกวันนี้ ทุกสิ่ง ทุกหน ทุกแห่ง ทุกอย่าง +ทุกอัน ทุกๆ เท่า เท่ากัน เท่ากับ เท่าใด เท่าที่ เท่านั้น เท่านี้ เท่าไร เท่าไหร่ แท้ แท้จริง เธอ นอกจากว่า น้อย น้อยกว่า น้อยๆ น่ะ นั้นไว นับแต่นี้ นาง +นางสาว น่าจะ นาน นานๆ นาย นำ นำพา นำมา นิด นิดหน่อย นิดๆ นี่ นี่ไง นี่นา นี่แน่ะ นี่แหละ นี้แหล่ นี่เอง นี้เอง นู่น นู้น เน้น เนี่ย +เนี่ยเอง ในช่วง ในที่ ในเมื่อ ในระหว่าง บน บอก บอกแล้ว บอกว่า บ่อย บ่อยกว่า บ่อยครั้ง บ่อยๆ บัดดล บัดเดี๋ยวนี้ บัดนั้น บัดนี้ บ้าง บางกว่า +บางขณะ บางครั้ง บางครา บางคราว บางที บางที่ บางแห่ง บางๆ ปฏิบัติ ประกอบ ประการ ประการฉะนี้ ประการใด ประการหนึ่ง ประมาณ ประสบ ปรับ +ปรากฏ ปรากฏว่า ปัจจุบัน ปิด เป็นด้วย เป็นดัง เป็นต้น เป็นแต่ เป็นเพื่อ เป็นอัน เป็นอันมาก เป็นอาทิ ผ่านๆ ผู้ ผู้ใด เผื่อ เผื่อจะ เผื่อที่ เผื่อว่า ฝ่าย +ฝ่ายใด พบว่า พยายาม พร้อมกัน พร้อมกับ พร้อมด้วย พร้อมทั้ง พร้อมที่ พร้อมเพียง พวก พวกกัน พวกกู พวกแก 
พวกเขา พวกคุณ พวกฉัน พวกท่าน +พวกที่ พวกเธอ พวกนั้น พวกนี้ พวกนู้น พวกโน้น พวกมัน พวกมึง พอ พอกัน พอควร พอจะ พอดี พอตัว พอที พอที่ พอเพียง พอแล้ว พอสม พอสมควร +พอเหมาะ พอๆ พา พึง พึ่ง พื้นๆ พูด เพราะฉะนั้น เพราะว่า เพิ่ง เพิ่งจะ เพิ่ม เพิ่มเติม เพียง เพียงแค่ เพียงใด เพียงแต่ เพียงพอ เพียงเพราะ +เพื่อว่า เพื่อให้ ภายใต้ มองว่า มั๊ย มากกว่า มากมาย มิ มิฉะนั้น มิใช่ มิได้ มีแต่ มึง มุ่ง มุ่งเน้น มุ่งหมาย เมื่อก่อน เมื่อครั้ง เมื่อครั้งก่อน +เมื่อคราวก่อน เมื่อคราวที่ เมื่อคราว เมื่อคืน เมื่อเช้า เมื่อใด เมื่อนั้น เมื่อนี้ เมื่อเย็น เมื่อไร เมื่อวันวาน เมื่อวาน เมื่อไหร่ แม้ แม้กระทั่ง แม้แต่ แม้นว่า แม้ว่า +ไม่ค่อย ไม่ค่อยจะ ไม่ค่อยเป็น ไม่ใช่ ไม่เป็นไร ไม่ว่า ยก ยกให้ ยอม ยอมรับ ย่อม ย่อย ยังคง ยังงั้น ยังงี้ ยังโง้น ยังไง ยังจะ ยังแต่ ยาก +ยาว ยาวนาน ยิ่ง ยิ่งกว่า ยิ่งขึ้น ยิ่งขึ้นไป ยิ่งจน ยิ่งจะ ยิ่งนัก ยิ่งเมื่อ ยิ่งแล้ว ยิ่งใหญ่ ร่วมกัน รวมด้วย ร่วมด้วย รือว่า เร็ว เร็วๆ เราๆ เรียก เรียบ เรื่อย +เรื่อยๆ ไร ล้วน ล้วนจน ล้วนแต่ ละ ล่าสุด เล็ก เล็กน้อย เล็กๆ เล่าว่า แล้วกัน แล้วแต่ แล้วเสร็จ วันใด วันนั้น วันนี้ วันไหน สบาย สมัย สมัยก่อน +สมัยนั้น สมัยนี้ สมัยโน้น ส่วนเกิน ส่วนด้อย ส่วนดี ส่วนใด ส่วนที่ ส่วนน้อย ส่วนนั้น ส่วนมาก ส่วนใหญ่ สั้น สั้นๆ สามารถ สำคัญ สิ่ง +สิ่งใด สิ่งนั้น สิ่งนี้ สิ่งไหน สิ้น เสร็จแล้ว เสียด้วย เสียแล้ว แสดง แสดงว่า หน หนอ หนอย หน่อย หมด หมดกัน หมดสิ้น หรือไง หรือเปล่า หรือไม่ หรือยัง +หรือไร หากแม้ หากแม้น หากแม้นว่า หากว่า หาความ หาใช่ หารือ เหตุ เหตุผล เหตุนั้น เหตุนี้ เหตุไร เห็นแก่ เห็นควร เห็นจะ เห็นว่า เหลือ เหลือเกิน เหล่า +เหล่านั้น เหล่านี้ แห่งใด แห่งนั้น แห่งนี้ แห่งโน้น แห่งไหน แหละ ให้แก่ ใหญ่ ใหญ่โต อย่างเช่น อย่างดี อย่างเดียว อย่างใด อย่างที่ อย่างน้อย อย่างนั้น อย่างนี้ +อย่างโน้น อย่างมาก อย่างยิ่ง อย่างไร อย่างไรก็ อย่างไรก็ได้ อย่างไรเสีย อย่างละ อย่างหนึ่ง อย่างไหน อย่างๆ อัน อันจะ อันใด อันได้แก่ อันที่ +อันที่จริง อันที่จะ อันเนื่องมาจาก อันละ อันไหน อันๆ อาจจะ อาจเป็น อาจเป็นด้วย อื่น อื่นๆ เอ็ง เอา ฯ ฯล ฯลฯ +""".split()) \ No newline at end of file diff --git a/spacy/th/tag_map.py b/spacy/th/tag_map.py new file mode 100644 index 000000000..e225f7289 --- /dev/null +++ b/spacy/th/tag_map.py @@ -0,0 +1,81 @@ +# encoding: utf8 +# data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1) +from __future__ import unicode_literals + +from ..symbols import * + +TAG_MAP = { + #NOUN + "NOUN": {POS: NOUN}, + "NCMN": {POS: NOUN}, + "NTTL": {POS: NOUN}, + "CNIT": {POS: NOUN}, + "CLTV": {POS: NOUN}, + "CMTR": {POS: NOUN}, + "CFQC": {POS: NOUN}, + "CVBL": {POS: NOUN}, + #PRON + "PRON": {POS: PRON}, + "NPRP": {POS: PRON}, + # ADJ + "ADJ": {POS: ADJ}, + "NONM": {POS: ADJ}, + "VATT": {POS: ADJ}, + "DONM": {POS: ADJ}, + # ADV + "ADV": {POS: ADV}, + "ADVN": {POS: ADV}, + "ADVI": {POS: ADV}, + "ADVP": {POS: ADV}, + "ADVS": {POS: ADV}, + # INT + "INT": {POS: INTJ}, + # PRON + "PROPN": {POS: PROPN}, + "PPRS": {POS: PROPN}, + "PDMN": {POS: PROPN}, + "PNTR": {POS: PROPN}, + # DET + "DET": {POS: DET}, + "DDAN": {POS: DET}, + "DDAC": {POS: DET}, + "DDBQ": {POS: DET}, + "DDAQ": {POS: DET}, + "DIAC": {POS: DET}, + "DIBQ": {POS: DET}, + "DIAQ": {POS: DET}, + "DCNM": {POS: DET}, + # NUM + "NUM": {POS: NUM}, + "NCNM": {POS: NUM}, + "NLBL": {POS: NUM}, + "DCNM": {POS: NUM}, + # AUX + "AUX": {POS: AUX}, + "XVBM": {POS: AUX}, + "XVAM": {POS: AUX}, + "XVMM": {POS: AUX}, + "XVBB": {POS: AUX}, + "XVAE": {POS: AUX}, + # ADP + "ADP": {POS: ADP}, + "RPRE": {POS: ADP}, + # CCONJ + "CCONJ": {POS: CCONJ}, + "JCRG": {POS: CCONJ}, + # SCONJ + "SCONJ": {POS: SCONJ}, + "PREL": {POS: SCONJ}, + "JSBR": {POS: 
SCONJ}, + "JCMP": {POS: SCONJ}, + # PART + "PART": {POS: PART}, + "FIXN": {POS: PART}, + "FIXV": {POS: PART}, + "EAFF": {POS: PART}, + "AITT": {POS: PART}, + "NEG": {POS: PART}, + # PUNCT + "PUNCT": {POS: PUNCT}, + "PUNC": {POS: PUNCT} +} \ No newline at end of file diff --git a/spacy/th/tokenizer_exceptions.py b/spacy/th/tokenizer_exceptions.py new file mode 100644 index 000000000..7e3967aed --- /dev/null +++ b/spacy/th/tokenizer_exceptions.py @@ -0,0 +1,80 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + + +TOKENIZER_EXCEPTIONS = { + "ม.ค.": [ + {ORTH: "ม.ค.", LEMMA: "มกราคม"} + ], + "ก.พ.": [ + {ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"} + ], + "มี.ค.": [ + {ORTH: "มี.ค.", LEMMA: "มีนาคม"} + ], + "เม.ย.": [ + {ORTH: "เม.ย.", LEMMA: "เมษายน"} + ], + "พ.ค.": [ + {ORTH: "พ.ค.", LEMMA: "พฤษภาคม"} + ], + "มิ.ย.": [ + {ORTH: "มิ.ย.", LEMMA: "มิถุนายน"} + ], + "ก.ค.": [ + {ORTH: "ก.ค.", LEMMA: "กรกฎาคม"} + ], + "ส.ค.": [ + {ORTH: "ส.ค.", LEMMA: "สิงหาคม"} + ], + "ก.ย.": [ + {ORTH: "ก.ย.", LEMMA: "กันยายน"} + ], + "ต.ค.": [ + {ORTH: "ต.ค.", LEMMA: "ตุลาคม"} + ], + "พ.ย.": [ + {ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"} + ], + "ธ.ค.": [ + {ORTH: "ธ.ค.", LEMMA: "ธันวาคม"} + ] +} + + +# exceptions mapped to a single token containing only ORTH property +# example: {"string": [{ORTH: "string"}]} +# converted using strings_to_exc() util +''' +ORTH_ONLY = [ + "a.", + "b.", + "c.", + "d.", + "e.", + "f.", + "g.", + "h.", + "i.", + "j.", + "k.", + "l.", + "m.", + "n.", + "o.", + "p.", + "q.", + "r.", + "s.", + "t.", + "u.", + "v.", + "w.", + "x.", + "y.", + "z." +] +''' \ No newline at end of file From 39bb5690f0e1398b75407f70e89f88da4f9c3738 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 21 Sep 2017 00:36:02 +0700 Subject: [PATCH 074/110] update th --- spacy/th/__init__.py | 4 +--- spacy/th/tokenizer_exceptions.py | 37 +------------------------------- 2 files changed, 2 insertions(+), 39 deletions(-) diff --git a/spacy/th/__init__.py b/spacy/th/__init__.py index 0b6f8cf76..0ed5268c6 100644 --- a/spacy/th/__init__.py +++ b/spacy/th/__init__.py @@ -25,6 +25,4 @@ class Thai(Language): raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " "https://github.com/wannaphongcom/pythainlp/") words = [x for x in list(word_tokenize(text,"newmm"))] - return Doc(self.vocab, words=words, spaces=[False]*len(words)) - -__all__ = ['Thai'] \ No newline at end of file + return Doc(self.vocab, words=words, spaces=[False]*len(words)) \ No newline at end of file diff --git a/spacy/th/tokenizer_exceptions.py b/spacy/th/tokenizer_exceptions.py index 7e3967aed..0f933f1c1 100644 --- a/spacy/th/tokenizer_exceptions.py +++ b/spacy/th/tokenizer_exceptions.py @@ -42,39 +42,4 @@ TOKENIZER_EXCEPTIONS = { "ธ.ค.": [ {ORTH: "ธ.ค.", LEMMA: "ธันวาคม"} ] -} - - -# exceptions mapped to a single token containing only ORTH property -# example: {"string": [{ORTH: "string"}]} -# converted using strings_to_exc() util -''' -ORTH_ONLY = [ - "a.", - "b.", - "c.", - "d.", - "e.", - "f.", - "g.", - "h.", - "i.", - "j.", - "k.", - "l.", - "m.", - "n.", - "o.", - "p.", - "q.", - "r.", - "s.", - "t.", - "u.", - "v.", - "w.", - "x.", - "y.", - "z." 
-] -''' \ No newline at end of file +} \ No newline at end of file From 1abf472068ef700c66da4dc0f4beadb3ccd7c718 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 21 Sep 2017 12:56:58 +0700 Subject: [PATCH 075/110] add th test --- spacy/tests/conftest.py | 6 ++++++ spacy/tests/th/test_tokenizer.py | 13 +++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 spacy/tests/th/test_tokenizer.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 6e00b1513..c9652b08d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -15,6 +15,7 @@ from ..fi import Finnish from ..bn import Bengali from ..he import Hebrew from ..nb import Norwegian +from ..th import Thai from ..tokens import Doc @@ -101,6 +102,11 @@ def he_tokenizer(): def nb_tokenizer(): return Norwegian.Defaults.create_tokenizer() +@pytest.fixture +def th_tokenizer(): + pythainlp = pytest.importorskip("pythainlp") + return Thai.Defaults.create_tokenizer() + @pytest.fixture def stringstore(): return StringStore() diff --git a/spacy/tests/th/test_tokenizer.py b/spacy/tests/th/test_tokenizer.py new file mode 100644 index 000000000..851c6f067 --- /dev/null +++ b/spacy/tests/th/test_tokenizer.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +TOKENIZER_TESTS = [ + ("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม']) +] + +@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) +def test_thai_tokenizer(th_tokenizer, text, expected_tokens): + tokens = [token.text for token in th_tokenizer(text)] + assert tokens == expected_tokens From 425c09488d1370d217b46521e2942b4b04a4e254 Mon Sep 17 00:00:00 2001 From: Yam Date: Fri, 22 Sep 2017 08:56:34 +0800 Subject: [PATCH 076/110] Update word-vectors-similarities.jade add ``` import spacy nlp = spacy.load('en') ``` --- website/docs/usage/word-vectors-similarities.jade | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/docs/usage/word-vectors-similarities.jade index 3cc0a67a8..3fd6326d1 100644 --- a/website/docs/usage/word-vectors-similarities.jade +++ b/website/docs/usage/word-vectors-similarities.jade @@ -21,10 +21,12 @@ p +code. 
import numpy + import spacy + nlp = spacy.load('en') apples, and_, oranges = nlp(u'apples and oranges') print(apples.vector.shape) - # (1,) + # (300,) apples.similarity(oranges) p From 923c4c2fb2863858c18d262de53746f42c9aa6ae Mon Sep 17 00:00:00 2001 From: Yam Date: Fri, 22 Sep 2017 09:50:46 +0800 Subject: [PATCH 077/110] Update punctuation.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add `……` --- spacy/language_data/punctuation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index 58ec73f2d..3b5307496 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -36,7 +36,7 @@ _HYPHENS = r""" LIST_ELLIPSES = [ r'\.\.+', - "…" + "… ……" ] From 6f450306c3429d19472e7ae25bcbcd7f8b835e2d Mon Sep 17 00:00:00 2001 From: Yam Date: Fri, 22 Sep 2017 10:53:22 +0800 Subject: [PATCH 078/110] Update customizing-tokenizer.jade update some codes: - `me` -> `-PRON` - `TAG` -> `POS` - `create_tokenizer` function --- website/docs/usage/customizing-tokenizer.jade | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index ca5be9ef1..c7f717380 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -40,7 +40,9 @@ p { ORTH: u'me'}]) assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] - assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that'] + # Pronoun lemma is returned as -PRON- + # More details please see: https://spacy.io/docs/usage/troubleshooting#pron-lemma + assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that'] p | The special case doesn't have to match an entire whitespace-delimited @@ -57,7 +59,7 @@ p +code. nlp.tokenizer.add_special_case(u'...gimme...?', [{ - ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]) + ORTH: u'...gimme...?', LEMMA: u'give', POS: u'VB'}]) assert len(nlp(u'...gimme...?')) == 1 p @@ -172,12 +174,14 @@ p prefix_re = re.compile(r'''[\[\("']''') suffix_re = re.compile(r'''[\]\)"']''') + infix_re = re.compile(r'''[-~]''') def create_tokenizer(nlp): - return Tokenizer(nlp.vocab, + return Tokenizer(nlp.vocab, rules={}, prefix_search=prefix_re.search, - suffix_search=suffix_re.search) + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer) - nlp = spacy.load('en', tokenizer=create_make_doc) + nlp = spacy.load('en', create_make_doc=create_tokenizer) p | If you need to subclass the tokenizer instead, the relevant methods to From 54855f0eee6707798caa58d41d192ec4401a5763 Mon Sep 17 00:00:00 2001 From: Yam Date: Fri, 22 Sep 2017 12:15:48 +0800 Subject: [PATCH 079/110] Update customizing-tokenizer.jade --- website/docs/usage/customizing-tokenizer.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index c7f717380..c2f840a27 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -59,7 +59,7 @@ p +code. 
nlp.tokenizer.add_special_case(u'...gimme...?', [{ - ORTH: u'...gimme...?', LEMMA: u'give', POS: u'VB'}]) + ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]) assert len(nlp(u'...gimme...?')) == 1 p From b6ebedd09c03648c8bd3a448bd15ab87ce1631e4 Mon Sep 17 00:00:00 2001 From: Jeffrey Gerard Date: Mon, 25 Sep 2017 13:13:25 -0700 Subject: [PATCH 080/110] Document Tokenizer(token_match) and clarify tokenizer_pseudo_code Closes #835 In the `tokenizer_pseudo_code` I put the `special_cases` kwarg before `find_prefix` because this now matches the order the args are used in the pseudocode, and it also matches spacy's actual code. --- website/docs/usage/customizing-tokenizer.jade | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index c2f840a27..173521a33 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -87,8 +87,8 @@ p | algorithm in Python, optimized for readability rather than performance: +code. - def tokenizer_pseudo_code(text, find_prefix, find_suffix, - find_infixes, special_cases): + def tokenizer_pseudo_code(text, special_cases, + find_prefix, find_suffix, find_infixes): tokens = [] for substring in text.split(' '): suffixes = [] @@ -140,7 +140,7 @@ p p | Let's imagine you wanted to create a tokenizer for a new language. There - | are four things you would need to define: + | are five things you would need to define: +list("numbers") +item @@ -162,6 +162,11 @@ p | A function #[code infixes_finditer], to handle non-whitespace | separators, such as hyphens etc. + +item + | (Optional) A boolean function #[code token_match] matching strings + | that should never be split, overriding the previous rules. + | Useful for things like URLs or numbers. + p | You shouldn't usually need to create a #[code Tokenizer] subclass. 
| Standard usage is to use #[code re.compile()] to build a regular @@ -175,11 +180,15 @@ p prefix_re = re.compile(r'''[\[\("']''') suffix_re = re.compile(r'''[\]\)"']''') infix_re = re.compile(r'''[-~]''') + simple_url_re = re.compile(r'''^https?://''') def create_tokenizer(nlp): - return Tokenizer(nlp.vocab, rules={}, + return Tokenizer(nlp.vocab, + rules={}, prefix_search=prefix_re.search, suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer) + infix_finditer=infix_re.finditer, + token_match=simple_url_re.match + ) nlp = spacy.load('en', create_make_doc=create_tokenizer) From 259ed027af0e4584956b7d00c37a3beb9d5b8d98 Mon Sep 17 00:00:00 2001 From: Vincent Genty Date: Tue, 26 Sep 2017 15:46:04 +0200 Subject: [PATCH 081/110] Fixed NER model loading bug --- spacy/syntax/parser.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index b9de1e114..48edb6d22 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -147,6 +147,9 @@ cdef class Parser: # TODO: remove this shim when we don't have to support older data if 'labels' in cfg and 'actions' not in cfg: cfg['actions'] = cfg.pop('labels') + # Convert string keys to int + if cfg.get('actions'): + cfg['actions'] = {int(action_name): labels for action_name, labels in cfg['actions'].items()} # TODO: remove this shim when we don't have to support older data for action_name, labels in dict(cfg.get('actions', {})).items(): # We need this to be sorted From a9362f1c73fd7197548f6d32ed997600d15f9ff2 Mon Sep 17 00:00:00 2001 From: Ondrej Kokes Date: Wed, 4 Oct 2017 12:55:07 +0200 Subject: [PATCH 082/110] Fixing links to SyntaxNet --- website/docs/api/index.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/index.jade b/website/docs/api/index.jade index 24f3d4458..7e3f1a906 100644 --- a/website/docs/api/index.jade +++ b/website/docs/api/index.jade @@ -6,7 +6,7 @@ include ../../_includes/_mixins p | Here's a quick comparison of the functionalities offered by spaCy, - | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet], + | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet], | #[+a("http://www.nltk.org/py-modindex.html") NLTK] and | #[+a("http://stanfordnlp.github.io/CoreNLP/") CoreNLP]. @@ -107,7 +107,7 @@ p p | In 2016, Google released their - | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet] + | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet] | library, setting a new state of the art for syntactic dependency parsing | accuracy. SyntaxNet's algorithm is very similar to spaCy's. 
The main | difference is that SyntaxNet uses a neural network while spaCy uses a @@ -129,7 +129,7 @@ p +cell=data +row - +cell #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") Parsey McParseface] + +cell #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") Parsey McParseface] each data in [ 94.15, 89.08, 94.77 ] +cell=data From e81a608173e78b10da5984cf0d2632de29f407f1 Mon Sep 17 00:00:00 2001 From: Orion Montoya Date: Thu, 5 Oct 2017 10:47:48 -0400 Subject: [PATCH 083/110] Regression test for lemmatizer exceptions -- demonstrate issue #1387 --- spacy/tests/regression/test_issue1387.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 spacy/tests/regression/test_issue1387.py diff --git a/spacy/tests/regression/test_issue1387.py b/spacy/tests/regression/test_issue1387.py new file mode 100644 index 000000000..c5f01d145 --- /dev/null +++ b/spacy/tests/regression/test_issue1387.py @@ -0,0 +1,22 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ...symbols import POS, VERB, VerbForm_part +from ...vocab import Vocab +from ...lemmatizer import Lemmatizer +from ..util import get_doc + +import pytest + +def test_issue1387(): + tag_map = {'VBG': {POS: VERB, VerbForm_part: True}} + index = {"verb": ("cope","cop")} + exc = {"verb": {"coping": ("cope",)}} + rules = {"verb": [["ing", ""]]} + lemmatizer = Lemmatizer(index, exc, rules) + vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) + doc = get_doc(vocab, ["coping"]) + doc[0].tag_ = 'VBG' + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" + From ffb50d21a043a1028a7a8ac3f354483ec100fce6 Mon Sep 17 00:00:00 2001 From: Orion Montoya Date: Thu, 5 Oct 2017 10:49:02 -0400 Subject: [PATCH 084/110] Lemmatizer honors exceptions: Fix #1387 --- spacy/lemmatizer.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d7541c56b..1112bcee3 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -78,15 +78,16 @@ def lemmatize(string, index, exceptions, rules): # forms.append(string) forms.extend(exceptions.get(string, [])) oov_forms = [] - for old, new in rules: - if string.endswith(old): - form = string[:len(string) - len(old)] + new - if not form: - pass - elif form in index or not form.isalpha(): - forms.append(form) - else: - oov_forms.append(form) + if not forms: + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) if not forms: forms.extend(oov_forms) if not forms: From b0d271809dab5146fdc45cfcfab2e467b8a9347e Mon Sep 17 00:00:00 2001 From: Orion Montoya Date: Thu, 5 Oct 2017 10:49:28 -0400 Subject: [PATCH 085/110] Unit test for lemmatizer exceptions -- copied from regression test for #1387 --- spacy/tests/tagger/test_lemmatizer.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index 5db0d0b2c..91ed7d2f1 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -47,3 +47,20 @@ def test_tagger_lemmatizer_lemma_assignment(EN): assert all(t.lemma_ == '' for t in doc) EN.tagger(doc) assert all(t.lemma_ != '' for t in doc) + + +from ...symbols import POS, VERB, VerbForm_part +from ...vocab import Vocab +from ...lemmatizer import Lemmatizer +from ..util import get_doc +def 
test_tagger_lemmatizer_exceptions(): + index = {"verb": ("cope","cop")} + exc = {"verb": {"coping": ("cope",)}} + rules = {"verb": [["ing", ""]]} + tag_map = {'VBG': {POS: VERB, VerbForm_part: True}} + lemmatizer = Lemmatizer(index, exc, rules) + vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) + doc = get_doc(vocab, ["coping"]) + doc[0].tag_ = 'VBG' + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" From e77d8886f7bad951341060fee328eaa7ab4e927e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Oct 2017 22:22:04 +0200 Subject: [PATCH 086/110] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 995f6901f..97c53c3d2 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -26,7 +26,9 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Ines Montani, [@ines](https://github.com/ines) * J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading) * Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan) +* Jim Geovedi, [@geovedi](https://github.com/geovedi) * Jim Regan, [@jimregan](https://github.com/jimregan) +* Jeffrey Gerard, [@IamJeffG](https://github.com/IamJeffG) * Jordan Suchow, [@suchow](https://github.com/suchow) * Josh Reeter, [@jreeter](https://github.com/jreeter) * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks) @@ -41,6 +43,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Michael Wallin, [@wallinm1](https://github.com/wallinm1) * Miguel Almeida, [@mamoit](https://github.com/mamoit) * Oleg Zd, [@olegzd](https://github.com/olegzd) +* Paul O'Leary McCann, [@polm](https://github.com/polm) * Pokey Rule, [@pokey](https://github.com/pokey) * Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202) * Rob van Nieuwpoort, [@RvanNieuwpoort](https://github.com/RvanNieuwpoort) @@ -51,11 +54,15 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Swier, [@swierh](https://github.com/swierh) * Thomas Tanon, [@Tpt](https://github.com/Tpt) * Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues) +* Vimos Tan, [@Vimos](https://github.com/Vimos) * Vsevolod Solovyov, [@vsolovyov](https://github.com/vsolovyov) * Wah Loon Keng, [@kengz](https://github.com/kengz) +* Wannaphong Phatthiyaphaibun, [@wannaphongcom](https://github.com/wannaphongcom) * Willem van Hage, [@wrvhage](https://github.com/wrvhage) * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker) +* Yam, [@hscspring](https://github.com/hscspring) * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang) * Yasuaki Uechi, [@uetchy](https://github.com/uetchy) +* Yu-chun Huang, [@galaxyh](https://github.com/galaxyh) * Yubing Dong, [@tomtung](https://github.com/tomtung) * Yuval Pinter, [@yuvalpinter](https://github.com/yuvalpinter) From e04e11070f78ea827ddce40e62ee9ce8c7f38489 Mon Sep 17 00:00:00 2001 From: Orion Montoya Date: Thu, 5 Oct 2017 17:45:45 -0400 Subject: [PATCH 087/110] Contributor agreement for Orion Montoya @mdcclv --- .github/contributors/mdcclv.md | 106 +++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/mdcclv.md diff --git a/.github/contributors/mdcclv.md b/.github/contributors/mdcclv.md new file mode 100644 index 000000000..14ebfae26 --- /dev/null +++ b/.github/contributors/mdcclv.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the 
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------------------- | +| Name | Orion Montoya | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 04-10-2017 | +| GitHub username | mdcclv | +| Website (optional) | http://www.mdcclv.com/ | From 763b54cbc38120f63c308b4d519c9fb2cb2408ae Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 6 Oct 2017 16:30:44 +0700 Subject: [PATCH 088/110] Update adding-languages.jade Fixed misspellings --- website/docs/usage/adding-languages.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 7d893b4eb..02dfb79ca 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -525,13 +525,13 @@ p | └── oov_prob # optional ├── pos/ # optional | ├── model # via nlp.tagger.model.dump(path) - | └── config.json # via Langage.train + | └── config.json # via Language.train ├── deps/ # optional | ├── model # via nlp.parser.model.dump(path) - | └── config.json # via Langage.train + | └── config.json # via Language.train └── ner/ # optional ├── model # via nlp.entity.model.dump(path) - └── config.json # via Langage.train + └── config.json # via Language.train p | This creates a spaCy data directory with a vocabulary model, ready to be From e89689a31d69180b9ee22603b488a3594a8383dc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 6 Oct 2017 18:02:40 +0200 Subject: [PATCH 089/110] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 97c53c3d2..9e210bd4c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -43,6 +43,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Michael Wallin, [@wallinm1](https://github.com/wallinm1) * Miguel Almeida, [@mamoit](https://github.com/mamoit) * Oleg Zd, [@olegzd](https://github.com/olegzd) +* Orion Montoya, [@mdcclv](https://github.com/mdcclv) * Paul O'Leary McCann, [@polm](https://github.com/polm) * Pokey Rule, [@pokey](https://github.com/pokey) * Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202) 
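As a companion to the data directory layout shown in the adding-languages fix above, the optional model files could be written out roughly as follows. This is only a sketch: the `nlp` object is assumed to be an already trained pipeline with tagger, parser and entity components, and the output path is invented for illustration; the dump() calls simply mirror the comments in that layout.

    from pathlib import Path

    data_dir = Path('my_language_data')          # hypothetical output directory
    for name in ('pos', 'deps', 'ner'):
        (data_dir / name).mkdir(parents=True, exist_ok=True)

    # mirrors "# via nlp.tagger.model.dump(path)" etc. in the layout above
    nlp.tagger.model.dump(str(data_dir / 'pos' / 'model'))
    nlp.parser.model.dump(str(data_dir / 'deps' / 'model'))
    nlp.entity.model.dump(str(data_dir / 'ner' / 'model'))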
From efe0800f91dd35d114cbcdf64845bdafa34de9f5 Mon Sep 17 00:00:00 2001 From: Yam Date: Mon, 9 Oct 2017 21:39:15 -0500 Subject: [PATCH 090/110] Update training.jade fix several changes --- website/docs/usage/training.jade | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index 8a5c111bd..3a15ae2a1 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -33,12 +33,14 @@ p from spacy.vocab import Vocab from spacy.pipeline import EntityRecognizer from spacy.tokens import Doc + from spacy.gold import GoldParse vocab = Vocab() entity = EntityRecognizer(vocab, entity_types=['PERSON', 'LOC']) doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?']) - entity.update(doc, ['O', 'O', 'B-PERSON', 'L-PERSON', 'O']) + gold = GoldParse(doc, entities=['O', 'O', 'B-PERSON', 'L-PERSON', 'O']) + entity.update(doc, gold) entity.model.end_training() @@ -65,13 +67,14 @@ p.o-inline-list from spacy.vocab import Vocab from spacy.pipeline import DependencyParser from spacy.tokens import Doc + from spacy.gold import GoldParse vocab = Vocab() parser = DependencyParser(vocab, labels=['nsubj', 'compound', 'dobj', 'punct']) doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?']) - parser.update(doc, [(1, 'nsubj'), (1, 'ROOT'), (3, 'compound'), (1, 'dobj'), - (1, 'punct')]) + gold = GoldParse(doc, [1,1,3,1,1], ['nsubj', 'ROOT', 'compound', 'dobj', 'punct']) + parser.update(doc, gold) parser.model.end_training() @@ -120,7 +123,7 @@ p +code. from spacy.vocab import Vocab - from spacy.pipeline import Tagger + from spacy.tagger import Tagger from spacy.tagger import P2_orth, P1_orth from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth From 3452d6ce521943fb0bb02f59d3d9e3a1bac218c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Wed, 11 Oct 2017 11:24:00 +0200 Subject: [PATCH 091/110] Resolve issue #1078 by simplifying URL pattern - avoid catastrophic backtracking - reduce character range of host name, domain name and TLD identifier --- spacy/language_data/tokenizer_exceptions.py | 6 +++--- spacy/tests/tokenizer/test_urls.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py index b84adb2c4..9d5187d83 100644 --- a/spacy/language_data/tokenizer_exceptions.py +++ b/spacy/language_data/tokenizer_exceptions.py @@ -32,11 +32,11 @@ _URL_PATTERN = ( r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" r"|" # host name - r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)" + r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)" # domain name - r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*" + r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*" # TLD identifier - r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" + r"(?:\.(?:[a-z]{2,}))" r")" # port number r"(?::\d{2,5})?" 
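To make the intent of the simplified host name, domain name and TLD groups above easier to verify, here is a small self-contained check. The pattern is a cut-down, anchored copy of just those three groups for illustration, not the full _URL_PATTERN used by the tokenizer:

    import re

    host_re = re.compile(
        r"^(?:(?:[a-z0-9\-]*)?[a-z0-9]+)"     # host name
        r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*"    # domain name
        r"(?:\.(?:[a-z]{2,}))$"               # TLD identifier
    )

    print(bool(host_re.match("example.com")))    # expected: True
    print(bool(host_re.match("a.b--c.de")))      # expected: True
    print(bool(host_re.match(u"例子.测试")))       # expected: False, non-ASCII hosts no longer match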
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 959067110..3bb6521f1 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -33,13 +33,10 @@ URLS_SHOULD_MATCH = [ "http://userid:password@example.com/", "http://142.42.1.1/", "http://142.42.1.1:8080/", - "http://⌘.ws", - "http://⌘.ws/", "http://foo.com/blah_(wikipedia)#cite-1", "http://foo.com/blah_(wikipedia)_blah#cite-1", "http://foo.com/unicode_(✪)_in_parens", "http://foo.com/(something)?after=parens", - "http://☺.damowmow.com/", "http://code.google.com/events/#&product=browser", "http://j.mp", "ftp://foo.bar/baz", @@ -49,14 +46,17 @@ URLS_SHOULD_MATCH = [ "http://a.b-c.de", "http://223.255.255.254", "http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014 - "http://✪df.ws/123", - "http://➡.ws/䨹", - "http://مثال.إختبار", - "http://例子.测试", - "http://उदाहरण.परीक्षा", pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"), pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"), + pytest.mark.xfail("http://⌘.ws"), + pytest.mark.xfail("http://⌘.ws/"), + pytest.mark.xfail("http://☺.damowmow.com/"), + pytest.mark.xfail("http://✪df.ws/123"), + pytest.mark.xfail("http://➡.ws/䨹"), + pytest.mark.xfail("http://مثال.إختبار"), + pytest.mark.xfail("http://例子.测试"), + pytest.mark.xfail("http://उदाहरण.परीक्षा"), ] URLS_SHOULD_NOT_MATCH = [ @@ -83,7 +83,6 @@ URLS_SHOULD_NOT_MATCH = [ "http://foo.bar/foo(bar)baz quux", "ftps://foo.bar/", "http://-error-.invalid/", - "http://-a.b.co", "http://a.b-.co", "http://0.0.0.0", "http://10.1.1.0", @@ -99,6 +98,7 @@ URLS_SHOULD_NOT_MATCH = [ pytest.mark.xfail("foo.com"), pytest.mark.xfail("http://1.1.1.1.1"), pytest.mark.xfail("http://www.foo.bar./"), + pytest.mark.xfail("http://-a.b.co"), ] From 2a78f4d6345084fda788a7f94beff963026b0e83 Mon Sep 17 00:00:00 2001 From: yuukos Date: Thu, 12 Oct 2017 22:23:19 +0700 Subject: [PATCH 092/110] updated .gitignore file added excluding PyCharm's idea directory --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 84ced41f8..ecd8ed39f 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,7 @@ Desktop.ini # Other *.tgz + + +# JetBrains PyCharm +.idea/ \ No newline at end of file From 7b9491679ffa235ce6cc3f8d3f94b00c14d40655 Mon Sep 17 00:00:00 2001 From: yuukos Date: Thu, 12 Oct 2017 22:24:20 +0700 Subject: [PATCH 093/110] added russian language support --- spacy/ru/__init__.py | 56 ++++++++++++++++++++++++++++++++ spacy/ru/language_data.py | 18 ++++++++++ spacy/ru/stop_words.py | 54 ++++++++++++++++++++++++++++++ spacy/ru/tokenizer_exceptions.py | 29 +++++++++++++++++ 4 files changed, 157 insertions(+) create mode 100644 spacy/ru/__init__.py create mode 100644 spacy/ru/language_data.py create mode 100644 spacy/ru/stop_words.py create mode 100644 spacy/ru/tokenizer_exceptions.py diff --git a/spacy/ru/__init__.py b/spacy/ru/__init__.py new file mode 100644 index 000000000..d8f38e199 --- /dev/null +++ b/spacy/ru/__init__.py @@ -0,0 +1,56 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +from ..language import Language +from ..attrs import LANG +from ..tokens import Doc +from .language_data import * + + +class RussianTokenizer(object): + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Russian tokenizer requires the pymorphy2 library: " + "try to fix it with " + "pip install pymorphy2==0.8") + + _morph = 
MorphAnalyzer() + + def __init__(self, spacy_tokenizer, cls, nlp=None): + self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp) + self._spacy_tokenizer = spacy_tokenizer + + def __call__(self, text): + words = [self._normalize(RussianTokenizer._get_word(token)) + for token in self._spacy_tokenizer(text)] + + return Doc(self.vocab, words, [False] * len(words)) + + @staticmethod + def _get_word(token): + return token.lemma_ if len(token.lemma_) > 0 else token.text + + @classmethod + def _normalize(cls, word): + return cls._morph.parse(word)[0].normal_form + + +class RussianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ru' + + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS + + @classmethod + def create_tokenizer(cls, nlp=None): + tokenizer = super(RussianDefaults, cls).create_tokenizer(nlp) + return RussianTokenizer(tokenizer, cls, nlp) + + +class Russian(Language): + lang = 'ru' + + Defaults = RussianDefaults diff --git a/spacy/ru/language_data.py b/spacy/ru/language_data.py new file mode 100644 index 000000000..75ca41b65 --- /dev/null +++ b/spacy/ru/language_data.py @@ -0,0 +1,18 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from .. import language_data as base +from ..language_data import update_exc, strings_to_exc + +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS + + +STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) + + +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) + + +__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"] \ No newline at end of file diff --git a/spacy/ru/stop_words.py b/spacy/ru/stop_words.py new file mode 100644 index 000000000..ddb28af86 --- /dev/null +++ b/spacy/ru/stop_words.py @@ -0,0 +1,54 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +а + +будем будет будете будешь буду будут будучи будь будьте бы был была были было +быть + +в вам вами вас весь во вот все всё всего всей всем всём всеми всему всех всею +всея всю вся вы + +да для до + +его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею + +же + +за + +и из или им ими имъ их + +к как кем ко когда кого ком кому комья которая которого которое которой котором +которому которою которую которые который которым которыми которых кто + +меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего +моей моем моём моему моею можем может можете можешь мои мой моим моими моих +мочь мою моя мы + +на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим +нашими наших нашу не него нее неё ней нем нём нему нет нею ним ними них но + +о об один одна одни одним одними одних одно одного одной одном одному одною +одну он она оне они оно от + +по при + +с сам сама сами самим самими самих само самого самом самому саму свое своё +своего своей своем своём своему своею свои свой своим своими своих свою своя +себе себя собой собою + +та так такая такие таким такими таких такого такое такой таком такому такою +такую те тебе тебя тем теми тех то тобой тобою того той только том томах тому +тот тою ту ты + +у уже + +чего чем чём чему что чтобы + +эта эти этим этими этих это этого этой этом этому этот этою эту + +я +""".split()) \ No newline at end of file diff --git a/spacy/ru/tokenizer_exceptions.py b/spacy/ru/tokenizer_exceptions.py new file mode 100644 index 000000000..8df57a402 --- /dev/null +++ b/spacy/ru/tokenizer_exceptions.py 
@@ -0,0 +1,29 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * + + +TOKENIZER_EXCEPTIONS = { + "Пн.": [ + {ORTH: "Пн.", LEMMA: "Понедельник"} + ], + "Вт.": [ + {ORTH: "Вт.", LEMMA: "Вторник"} + ], + "Ср.": [ + {ORTH: "Ср.", LEMMA: "Среда"} + ], + "Чт.": [ + {ORTH: "Чт.", LEMMA: "Четверг"} + ], + "Пт.": [ + {ORTH: "Пт.", LEMMA: "Пятница"} + ], + "Сб.": [ + {ORTH: "Сб.", LEMMA: "Суббота"} + ], + "Вс.": [ + {ORTH: "Вс.", LEMMA: "Воскресенье"} + ], +} \ No newline at end of file From f81dd284eb2e8c09c55a4fc37abb3e00e278f0a8 Mon Sep 17 00:00:00 2001 From: yuukos Date: Thu, 12 Oct 2017 22:28:34 +0700 Subject: [PATCH 094/110] updated spacy/__init__.py registered russian language via set_lang_class --- spacy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index f0d5ea0fc..1e5faf504 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -7,13 +7,13 @@ from .cli.info import info from .glossary import explain from .about import __version__ -from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja,th +from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja,th, ru _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese, - th.Thai) + th.Thai, ru.Russian) for _lang in _languages: From 622b6d627078f5a5bc14ebb2840a64ec3db5d118 Mon Sep 17 00:00:00 2001 From: yuukos Date: Fri, 13 Oct 2017 13:57:29 +0700 Subject: [PATCH 095/110] updated Russian tokenizer moved the trying to import pymorph into __init__ --- spacy/ru/__init__.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/spacy/ru/__init__.py b/spacy/ru/__init__.py index d8f38e199..12b480a8a 100644 --- a/spacy/ru/__init__.py +++ b/spacy/ru/__init__.py @@ -8,17 +8,19 @@ from .language_data import * class RussianTokenizer(object): - try: - from pymorphy2 import MorphAnalyzer - except ImportError: - raise ImportError( - "The Russian tokenizer requires the pymorphy2 library: " - "try to fix it with " - "pip install pymorphy2==0.8") - - _morph = MorphAnalyzer() + _morph = None def __init__(self, spacy_tokenizer, cls, nlp=None): + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Russian tokenizer requires the pymorphy2 library: " + "try to fix it with " + "pip install pymorphy2==0.8") + + RussianTokenizer._morph = RussianTokenizer._create_morph(MorphAnalyzer) + self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp) self._spacy_tokenizer = spacy_tokenizer @@ -36,6 +38,12 @@ class RussianTokenizer(object): def _normalize(cls, word): return cls._morph.parse(word)[0].normal_form + @classmethod + def _create_morph(cls, morph_analyzer_class): + if not cls._morph: + cls._morph = morph_analyzer_class() + return cls._morph + class RussianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) From a229b6e0ded3b1255fd77e00c197fa35c9030e5b Mon Sep 17 00:00:00 2001 From: yuukos Date: Fri, 13 Oct 2017 14:04:37 +0700 Subject: [PATCH 096/110] added tests for Russian language added tests of creating Russian Language instance and Russian tokenizer --- spacy/tests/conftest.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 90b947702..718a8265c 100644 --- 
a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -16,7 +16,7 @@ from ..bn import Bengali from ..he import Hebrew from ..nb import Norwegian from ..th import Thai - +from ..ru import Russian from ..tokens import Doc from ..strings import StringStore @@ -30,7 +30,7 @@ import pytest # These languages get run through generic tokenizer tests LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, - Swedish, Hungarian, Finnish, Bengali, Norwegian] + Swedish, Hungarian, Finnish, Bengali, Norwegian, Russian] @pytest.fixture(params=LANGUAGES) @@ -53,6 +53,7 @@ def en_vocab(): def en_parser(): return English.Defaults.create_parser() + @pytest.fixture def es_tokenizer(): return Spanish.Defaults.create_tokenizer() @@ -83,11 +84,13 @@ def ja_tokenizer(): pytest.importorskip("MeCab") return Japanese.Defaults.create_tokenizer() + @pytest.fixture def japanese(): pytest.importorskip("MeCab") return Japanese() + @pytest.fixture def sv_tokenizer(): return Swedish.Defaults.create_tokenizer() @@ -102,15 +105,30 @@ def bn_tokenizer(): def he_tokenizer(): return Hebrew.Defaults.create_tokenizer() + @pytest.fixture def nb_tokenizer(): return Norwegian.Defaults.create_tokenizer() + @pytest.fixture def th_tokenizer(): pythainlp = pytest.importorskip("pythainlp") return Thai.Defaults.create_tokenizer() + +@pytest.fixture +def ru_tokenizer(): + pytest.importorskip("pymorphy2") + return Russian.Defaults.create_tokenizer() + + +@pytest.fixture +def russian(): + pytest.importorskip("pymorphy2") + return Russian() + + @pytest.fixture def stringstore(): return StringStore() @@ -118,7 +136,7 @@ def stringstore(): @pytest.fixture def en_entityrecognizer(): - return English.Defaults.create_entity() + return English.Defaults.create_entity() @pytest.fixture @@ -130,6 +148,7 @@ def lemmatizer(): def text_file(): return StringIO() + @pytest.fixture def text_file_b(): return BytesIO() @@ -149,11 +168,11 @@ def DE(): def pytest_addoption(parser): parser.addoption("--models", action="store_true", - help="include tests that require full models") + help="include tests that require full models") parser.addoption("--vectors", action="store_true", - help="include word vectors tests") + help="include word vectors tests") parser.addoption("--slow", action="store_true", - help="include slow tests") + help="include slow tests") def pytest_runtest_setup(item): From 6fb9d75bd2a9ed049300b4237bec23d7a09e6845 Mon Sep 17 00:00:00 2001 From: yuukos Date: Fri, 13 Oct 2017 15:51:03 +0700 Subject: [PATCH 097/110] fixed test with creating tokenizer --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 718a8265c..de0facf49 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -30,7 +30,7 @@ import pytest # These languages get run through generic tokenizer tests LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, - Swedish, Hungarian, Finnish, Bengali, Norwegian, Russian] + Swedish, Hungarian, Finnish, Bengali, Norwegian] @pytest.fixture(params=LANGUAGES) From ce00405afc176bd02363a7d703c3e61ef52fb851 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 13 Oct 2017 21:00:15 +0700 Subject: [PATCH 098/110] Create yuukos.md --- .github/contributors/yuukos.md | 106 +++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/yuukos.md diff --git a/.github/contributors/yuukos.md b/.github/contributors/yuukos.md new file mode 100644 index 000000000..aecafeecb 
--- /dev/null +++ b/.github/contributors/yuukos.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. 
The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Alexey Kim | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 13-12-2017 | +| GitHub username | yuukos | +| Website (optional) | | From 95836abee1c311bb95d291d0357f29b9f4e98e1c Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 13 Oct 2017 21:02:19 +0700 Subject: [PATCH 099/110] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 9e210bd4c..edd1ed30d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -3,6 +3,7 @@ This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work! * Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer) +* Alexey Kim, [@yuukos](https://github.com/yuukos) * Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman) * Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv) * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th) From a31d33be06b3a2c933bb1b0d4859778616065cb8 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sat, 14 Oct 2017 19:28:04 +0900 Subject: [PATCH 100/110] Contributor agreement --- .github/contributors/polm.md | 106 +++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/polm.md diff --git a/.github/contributors/polm.md b/.github/contributors/polm.md new file mode 100644 index 000000000..a2aa0cb65 --- /dev/null +++ b/.github/contributors/polm.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. 
The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that:
+
+  * Each contribution that you submit is and shall be an original work of
+  authorship and you can legally grant the rights set out in this SCA;
+
+  * to the best of your knowledge, each contribution will not violate any
+  third party's copyrights, trademarks, patents, or other intellectual
+  property rights; and
+
+  * each contribution shall be in compliance with U.S. export control laws and
+  other applicable export and import laws. You agree to notify us if you
+  become aware of any circumstance which would make any of the foregoing
+  representations inaccurate in any respect. We may publicly disclose your
+  participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+  * [x] I am signing on behalf of myself as an individual and no other person
+  or entity, including my employer, has or will have rights with respect my
+  contributions.
+
+  * [ ] I am signing on behalf of my employer or a legal entity and I have the
+  actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Paul McCann |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2017-10-14 |
+| GitHub username | polm |
+| Website (optional) | http://dampfkraft.com|

From 43eedf73f2aaf506e158115dfb328fb60bd91943 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Sun, 15 Oct 2017 23:33:25 +0900
Subject: [PATCH 101/110] [ja] Stash tokenizer output for speed

Before this commit, the Mecab tokenizer had to be called twice when creating
a Doc- once during tokenization and once during tagging. This creates a
JapaneseDoc wrapper class for Doc that stashes the parsed tokenizer output
to remove redundant processing. -POLM
---
 spacy/ja/__init__.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
index 2f85406c0..b2ec281f7 100644
--- a/spacy/ja/__init__.py
+++ b/spacy/ja/__init__.py
@@ -16,6 +16,13 @@ from collections import namedtuple
 
 ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])
 
+class JapaneseDoc(Doc):
+    def __init__(self, detailed_tokens, vocab, words=None, spaces=None, orths_and_spaces=None):
+        super(JapaneseDoc, self).__init__(vocab, words, spaces, orths_and_spaces)
+        # This saves tokenizer output so mecab doesn't have to be called again
+        # when determining POS tags.
+        self.detailed_tokens = detailed_tokens
+
 def try_mecab_import():
     """Mecab is required for Japanese support, so check for it.
@@ -34,8 +41,9 @@ class JapaneseTokenizer(object):
         self.tokenizer = MeCab.Tagger()
 
     def __call__(self, text):
-        words = [x.surface for x in detailed_tokens(self.tokenizer, text)]
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        dtokens = detailed_tokens(self.tokenizer, text)
+        words = [x.surface for x in dtokens]
+        return JapaneseDoc(dtokens, self.vocab, words=words, spaces=[False]*len(words))
 
 def resolve_pos(token):
     """If necessary, add a field to the POS tag for UD mapping.
@@ -91,7 +99,7 @@ class JapaneseTagger(object):
         # 1. get raw JP tags
         # 2. add features to tags as necessary for UD
-        dtokens = detailed_tokens(self.tokenizer, tokens.text)
+        dtokens = tokens.detailed_tokens
         rawtags = list(map(resolve_pos, dtokens))
         self.tagger.tag_from_strings(tokens, rawtags)
@@ -112,8 +120,7 @@ class Japanese(Language):
     Defaults = JapaneseDefaults
 
     def make_doc(self, text):
-        words = [str(t) for t in self.tokenizer(text)]
-        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        jdoc = self.tokenizer(text)
         tagger = JapaneseDefaults.create_tagger(self.tokenizer)
-        tagger(doc)
-        return doc
+        tagger(jdoc)
+        return jdoc

From 71ae8013ec5e981c9b44699afd82162c6f6c625b Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Mon, 16 Oct 2017 00:24:34 +0900
Subject: [PATCH 102/110] [ja] Use user_details instead of a wrapper class

Instead of using a JapaneseDoc wrapper class to store Mecab output, stash
it in `user_data`. -POLM
---
 spacy/ja/__init__.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
index b2ec281f7..26e39a593 100644
--- a/spacy/ja/__init__.py
+++ b/spacy/ja/__init__.py
@@ -16,12 +16,7 @@ from collections import namedtuple
 
 ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])
 
-class JapaneseDoc(Doc):
-    def __init__(self, detailed_tokens, vocab, words=None, spaces=None, orths_and_spaces=None):
-        super(JapaneseDoc, self).__init__(vocab, words, spaces, orths_and_spaces)
-        # This saves tokenizer output so mecab doesn't have to be called again
-        # when determining POS tags.
-        self.detailed_tokens = detailed_tokens
+DETAILS_KEY = 'mecab_details'
 
 def try_mecab_import():
     """Mecab is required for Japanese support, so check for it.
@@ -43,7 +38,10 @@ class JapaneseTokenizer(object):
     def __call__(self, text):
         dtokens = detailed_tokens(self.tokenizer, text)
         words = [x.surface for x in dtokens]
-        return JapaneseDoc(dtokens, self.vocab, words=words, spaces=[False]*len(words))
+        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        # stash details tokens for tagger to use
+        doc.user_data[DETAILS_KEY] = dtokens
+        return doc
 
 def resolve_pos(token):
     """If necessary, add a field to the POS tag for UD mapping.
@@ -99,7 +97,7 @@ class JapaneseTagger(object):
         # 1. get raw JP tags
         # 2. add features to tags as necessary for UD
-        dtokens = tokens.detailed_tokens
+        dtokens = tokens.user_data[DETAILS_KEY]
         rawtags = list(map(resolve_pos, dtokens))
         self.tagger.tag_from_strings(tokens, rawtags)

From 241d19a3e6f78918bc8296d574a1e65e4ce9381f Mon Sep 17 00:00:00 2001
From: yuukos
Date: Mon, 16 Oct 2017 13:37:05 +0700
Subject: [PATCH 103/110] fixed Russian Tokenizer - added trailing space flags for tokens

---
 spacy/ru/__init__.py             | 20 +++++++++++++++++---
 spacy/ru/language_data.py        |  2 +-
 spacy/ru/stop_words.py           |  2 +-
 spacy/ru/tokenizer_exceptions.py |  3 ++-
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/spacy/ru/__init__.py b/spacy/ru/__init__.py
index 12b480a8a..8789cd6e5 100644
--- a/spacy/ru/__init__.py
+++ b/spacy/ru/__init__.py
@@ -25,15 +25,29 @@ class RussianTokenizer(object):
         self._spacy_tokenizer = spacy_tokenizer
 
     def __call__(self, text):
-        words = [self._normalize(RussianTokenizer._get_word(token))
-                 for token in self._spacy_tokenizer(text)]
+        get_norm = RussianTokenizer._get_norm
+        has_space = RussianTokenizer._has_space
 
-        return Doc(self.vocab, words, [False] * len(words))
+        words_with_space_flags = [(get_norm(token), has_space(token, text))
+                                  for token in self._spacy_tokenizer(text)]
+
+        words, spaces = map(lambda s: list(s), zip(*words_with_space_flags))
+
+        return Doc(self.vocab, words, spaces)
 
     @staticmethod
     def _get_word(token):
         return token.lemma_ if len(token.lemma_) > 0 else token.text
 
+    @staticmethod
+    def _has_space(token, text):
+        pos_after_token = token.idx + len(token.text)
+        return pos_after_token < len(text) and text[pos_after_token] == ' '
+
+    @classmethod
+    def _get_norm(cls, token):
+        return cls._normalize(cls._get_word(token))
+
     @classmethod
     def _normalize(cls, word):
         return cls._morph.parse(word)[0].normal_form
diff --git a/spacy/ru/language_data.py b/spacy/ru/language_data.py
index 75ca41b65..d33d388fd 100644
--- a/spacy/ru/language_data.py
+++ b/spacy/ru/language_data.py
@@ -15,4 +15,4 @@ TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 
 
-__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"]
\ No newline at end of file
+__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"]
diff --git a/spacy/ru/stop_words.py b/spacy/ru/stop_words.py
index ddb28af86..2d89b7726 100644
--- a/spacy/ru/stop_words.py
+++ b/spacy/ru/stop_words.py
@@ -51,4 +51,4 @@ STOP_WORDS = set("""
 эта эти этим этими этих это этого этой этом этому этот этою эту
 
 я
-""".split())
\ No newline at end of file
+""".split())
diff --git a/spacy/ru/tokenizer_exceptions.py b/spacy/ru/tokenizer_exceptions.py
index 8df57a402..f444f3df6 100644
--- a/spacy/ru/tokenizer_exceptions.py
+++ b/spacy/ru/tokenizer_exceptions.py
@@ -26,4 +26,5 @@ TOKENIZER_EXCEPTIONS = {
     "Вс.": [
         {ORTH: "Вс.", LEMMA: "Воскресенье"}
     ],
-}
\ No newline at end of file
+}
+

From 8bd9b05fdc212e55b7714bb20594d8bb51657ba9 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 18 Oct 2017 14:13:36 +0200
Subject: [PATCH 104/110] Update CONTRIBUTING.md

---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8a9ab517b..7cc47296c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -70,7 +70,7 @@ The [spaCy developer resources](https://github.com/explosion/spacy-dev-resources
 
 ### Contributor agreement
 
-If you've made a substantial contribution to spaCy, you should fill in the [spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that your contribution can be used across the
project. If you agree to be bound by the terms of the agreement, fill in the [template]((.github/CONTRIBUTOR_AGREEMENT.md)) and include it with your pull request, or sumit it separately to [`.github/contributors/`](/.github/contributors). The name of the file should be your GitHub username, with the extension `.md`. For example, the user +If you've made a substantial contribution to spaCy, you should fill in the [spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that your contribution can be used across the project. If you agree to be bound by the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md) and include it with your pull request, or sumit it separately to [`.github/contributors/`](/.github/contributors). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`. From 5a4b5b362c27f1948187915e2349c35db8a5d64c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:29:10 +0200 Subject: [PATCH 105/110] Create shuvanon.md --- .github/contributors/shuvanon.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/shuvanon.md diff --git a/.github/contributors/shuvanon.md b/.github/contributors/shuvanon.md new file mode 100644 index 000000000..c915d48bf --- /dev/null +++ b/.github/contributors/shuvanon.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Shuvanon Razik | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 3/12/2017 | +| GitHub username | shuvanon | +| Website (optional) | | From e787045cf55db5b68d878a291793b6e3786d6633 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 18 Oct 2017 14:31:57 +0200 Subject: [PATCH 106/110] Revert "filled up CONTRIBUTOR_AGREEMENT.md" This reverts commit 8a2d22222dec5cf910df5a378cbcd9ea2ab53ec4. --- .github/CONTRIBUTOR_AGREEMENT.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index c915d48bf..668b9dba2 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [x] I am signing on behalf of myself as an individual and no other person + * [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect my contributions. @@ -98,9 +98,9 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | -| Name | Shuvanon Razik | +| Name | | | Company name (if applicable) | | | Title or role (if applicable) | | -| Date | 3/12/2017 | -| GitHub username | shuvanon | +| Date | | +| GitHub username | | | Website (optional) | | From 9162ecb43ff2883f271da2a7d5cab17615288ac3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:36:19 +0200 Subject: [PATCH 107/110] Update CONTRIBUTOR_AGREEMENT.md --- .github/CONTRIBUTOR_AGREEMENT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index 668b9dba2..f34603065 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -88,7 +88,7 @@ U.S. Federal law. Any choice of law rules will not apply. mark both statements: * [ ] I am signing on behalf of myself as an individual and no other person - or entity, including my employer, has or will have rights with respect my + or entity, including my employer, has or will have rights with respect to my contributions. * [ ] I am signing on behalf of my employer or a legal entity and I have the From 0b239ee6461a77f41d50aab64040b4f97f5949a5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:37:08 +0200 Subject: [PATCH 108/110] Create ines.md --- .github/contributors/ines.md | 106 +++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/ines.md diff --git a/.github/contributors/ines.md b/.github/contributors/ines.md new file mode 100644 index 000000000..5cd57b07e --- /dev/null +++ b/.github/contributors/ines.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ines Montani | +| Company name (if applicable) | Explosion AI | +| Title or role (if applicable) | Founder | +| Date | 2017/10/18 | +| GitHub username | ines | +| Website (optional) | https://explosion.ai | From 3357588b9fb6156cfcd48e3b9e556e413b5b9e27 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:41:31 +0200 Subject: [PATCH 109/110] Create honnibal.md --- .github/contributors/honnibal.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/honnibal.md diff --git a/.github/contributors/honnibal.md b/.github/contributors/honnibal.md new file mode 100644 index 000000000..3a700b7dd --- /dev/null +++ b/.github/contributors/honnibal.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Matthew Honnibal | +| Company name (if applicable) | Explosion AI | +| Title or role (if applicable) | Founder | +| Date | 2017-10-18 | +| GitHub username | honnibal | +| Website (optional) | https://explosion.ai | From e7b78370d99a59a80119ae1641b97ebbbb60088b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:41:38 +0200 Subject: [PATCH 110/110] Add note on origin of manually moved agreement See 8a2d22222dec5cf910df5a378cbcd9ea2ab53ec4 --- .github/contributors/shuvanon.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/contributors/shuvanon.md b/.github/contributors/shuvanon.md index c915d48bf..82d02d8d2 100644 --- a/.github/contributors/shuvanon.md +++ b/.github/contributors/shuvanon.md @@ -1,3 +1,5 @@ + + # spaCy contributor agreement This spaCy Contributor Agreement (**"SCA"**) is based on the
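The tokenizer patches earlier in this series (101–103) rely on two small techniques that are easy to miss in diff form: deriving each token's trailing-space flag from character offsets (the Russian tokenizer fix), and stashing the external tokenizer's detailed output on the `Doc` via `user_data` so a later component such as the tagger can reuse it instead of calling MeCab or pymorphy2 a second time (the Japanese changes). The sketch below is illustrative only and is not part of any commit above: the helper names, the `"external_details"` key, and the shape of the `analyses` dicts are made up for the example, and the only spaCy API it assumes is basic `Doc`/`Vocab` construction (`Doc(vocab, words=..., spaces=...)`), `doc.user_data`, and `token.whitespace_`.

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

DETAILS_KEY = "external_details"  # hypothetical key, analogous to 'mecab_details'


def trailing_space_flags(text, surfaces, offsets):
    """Mirror the offset arithmetic from the Russian tokenizer fix:
    a token 'owns' a trailing space iff the character right after it is ' '."""
    flags = []
    for surface, idx in zip(surfaces, offsets):
        end = idx + len(surface)
        flags.append(end < len(text) and text[end] == " ")
    return flags


def make_doc_with_details(vocab, text, analyses):
    """Build a Doc from pre-analysed tokens and stash the full analyses in
    user_data, so a downstream tagger can read them without re-tokenizing."""
    surfaces = [a["surface"] for a in analyses]
    offsets = [a["idx"] for a in analyses]
    spaces = trailing_space_flags(text, surfaces, offsets)
    doc = Doc(vocab, words=surfaces, spaces=spaces)
    doc.user_data[DETAILS_KEY] = analyses
    return doc


if __name__ == "__main__":
    text = "spaCy tokenizes text ."
    # Pretend this came from an external analyser such as MeCab or pymorphy2.
    analyses = [
        {"surface": "spaCy", "idx": 0, "tag": "NOUN"},
        {"surface": "tokenizes", "idx": 6, "tag": "VERB"},
        {"surface": "text", "idx": 16, "tag": "NOUN"},
        {"surface": ".", "idx": 21, "tag": "PUNCT"},
    ]
    doc = make_doc_with_details(Vocab(), text, analyses)
    print([(t.text, t.whitespace_) for t in doc])   # trailing-space flags per token
    print(doc.user_data[DETAILS_KEY][1]["tag"])     # tagger-side lookup, no second tokenizer call
```

In the committed Japanese code the same idea appears as `doc.user_data[DETAILS_KEY] = dtokens` in `JapaneseTokenizer.__call__` and a `tokens.user_data[DETAILS_KEY]` lookup on the tagger side; the sketch above only strips away the MeCab and pymorphy2 dependencies so it runs with a bare spaCy install.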