From aafdf6ffb8c062a70a90a655bfd675bf12992ac7 Mon Sep 17 00:00:00 2001 From: Eric Zhao Date: Tue, 28 Mar 2017 23:35:03 -0700 Subject: [PATCH 01/27] Add option to use label kwarg to determine ent_type in doc.merge --- spacy/tests/spans/test_merge.py | 11 ++++++++++- spacy/tokens/doc.pyx | 7 +++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/spacy/tests/spans/test_merge.py b/spacy/tests/spans/test_merge.py index 86712f771..29cc917fe 100644 --- a/spacy/tests/spans/test_merge.py +++ b/spacy/tests/spans/test_merge.py @@ -19,6 +19,15 @@ def test_spans_merge_tokens(en_tokenizer): assert doc[0].text == 'Los Angeles' assert doc[0].head.text == 'start' + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) + assert len(doc) == 4 + assert doc[0].head.text == 'Angeles' + assert doc[1].head.text == 'start' + doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', label='GPE') + assert len(doc) == 3 + assert doc[0].text == 'Los Angeles' + assert doc[0].head.text == 'start' + assert doc[0].ent_type_ == 'GPE' def test_spans_merge_heads(en_tokenizer): text = "I found a pilates class near work." @@ -114,4 +123,4 @@ def test_spans_subtree_size_check(en_tokenizer): sent1 = list(doc.sents)[0] init_len = len(list(sent1.root.subtree)) doc[0:2].merge('none', 'none', 'none') - assert len(list(sent1.root.subtree)) == init_len - 1 + assert len(list(sent1.root.subtree)) == init_len - 1 \ No newline at end of file diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 805a5b30c..15eaad33c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -667,6 +667,13 @@ cdef class Doc: attributes[TAG] = self.vocab.strings[tag] attributes[LEMMA] = self.vocab.strings[lemma] attributes[ENT_TYPE] = self.vocab.strings[ent_type] + elif not args: + if "label" in attributes and ENT_TYPE not in attributes: + if type(attributes["label"]) == int: + attributes[ENT_TYPE] = attributes["label"] + else: + attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]] + elif args: raise ValueError( "Doc.merge received %d non-keyword arguments. " From c2d48974bc49aa58719f878ae643cefe73f3dcf5 Mon Sep 17 00:00:00 2001 From: "Bruno P. 
Kinoshita" Date: Thu, 30 Mar 2017 21:59:18 +1300 Subject: [PATCH 02/27] Fix typos in Portuguese stop words --- spacy/pt/stop_words.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/pt/stop_words.py b/spacy/pt/stop_words.py index 311b82477..1faa4a270 100644 --- a/spacy/pt/stop_words.py +++ b/spacy/pt/stop_words.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals STOP_WORDS = set(""" -à às acerca adeus agora ainda algmas algo algumas alguns ali além ambos ano +à às acerca adeus agora ainda algo algumas alguns ali além ambos ano anos antes ao aos apenas apoio apontar após aquela aquelas aquele aqueles aqui aquilo area área as assim através atrás até aí @@ -27,7 +27,7 @@ geral grande grandes grupo hoje horas há -iniciar inicio ir irá isso ista iste isto já +iniciar inicio ir irá isso ista isto já lado ligado local logo longe lugar lá @@ -47,7 +47,9 @@ possível pouca pouco povo primeira primeiro promeiro próprio próximo puderam pôde põe põem qual qualquer quando quanto quarta quarto quatro que quem quer quero questão -quieto quinta quinto quinze quê relação +quieto quinta quinto quinze quê + +relação sabe saber se segunda segundo sei seis sem sempre ser seria sete seu seus sexta sexto sim sistema sob sobre sois somente somos sou sua suas são sétima sétimo From 294718244f290e8b7602f8d35822e07136b5f78f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 30 Mar 2017 05:21:13 -0500 Subject: [PATCH 03/27] Set tb=native in pytest, to try to fix travis flakiness --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 55c080d1d..acdf637d1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,9 +22,9 @@ install: script: - "pip install pytest" - - if [[ "${VIA}" == "compile" ]]; then python -m pytest spacy; fi - - if [[ "${VIA}" == "pypi" ]]; then python -m pytest `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi - - if [[ "${VIA}" == "sdist" ]]; then python -m pytest `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi + - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi + - if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi + - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi notifications: slack: From 564daf6dec204fe2f842a5aae459ad08fbb4d8b3 Mon Sep 17 00:00:00 2001 From: Joshua Reeter Date: Thu, 30 Mar 2017 23:47:45 -0500 Subject: [PATCH 04/27] Issue #934 symlink should not convert paths as_posix under windows. 
--- spacy/cli/link.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 3777fd85f..82d1d9a33 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -46,7 +46,7 @@ def symlink(model_path, link_name, force): # Add workaround for Python 2 on Windows (see issue #909) if util.is_python2() and util.is_windows(): import subprocess - command = ['mklink', '/d', link_path.as_posix(), model_path.as_posix()] + command = ['mklink', '/d', link_path, model_path] subprocess.call(command, shell=True) else: link_path.symlink_to(model_path) From 8eafe8045000f7d64633961de1bedf2bf39d2d40 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 31 Mar 2017 09:12:31 +0200 Subject: [PATCH 05/27] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 2f9b75936..27d280785 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -7,6 +7,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th) * Aniruddha Adhikary [@aniruddha-adhikary](https://github.com/aniruddha-adhikary) * Bhargav Srinivasa, [@bhargavvader](https://github.com/bhargavvader) +* Bruno P. Kinoshita, [@kinow](https://github.com/kinow) * Chris DuBois, [@chrisdubois](https://github.com/chrisdubois) * Christoph Schwienheer, [@chssch](https://github.com/chssch) * Dafne van Kuppevelt, [@dafnevk](https://github.com/dafnevk) * Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi) * Eric Zhao, [@ericzhao28](https://github.com/ericzhao28) * Greg Baker, [@solresol](https://github.com/solresol) +* Grégory Howard, [@Gregory-Howard](https://github.com/Gregory-Howard) * György Orosz, [@oroszgy](https://github.com/oroszgy) * Henning Peters, [@henningpeters](https://github.com/henningpeters) * Iddo Berger, [@iddoberger](https://github.com/iddoberger) * J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading) * Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan) * Jordan Suchow, [@suchow](https://github.com/suchow) +* Josh Reeter, [@jreeter](https://github.com/jreeter) * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks) * Kendrick Tan, [@kendricktan](https://github.com/kendricktan) * Kyle P. Johnson, [@kylepjohnson](https://github.com/kylepjohnson) From 47a3ef06a69607c747ae4b7d0f34ec37635c6317 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 31 Mar 2017 12:30:14 +0200 Subject: [PATCH 06/27] Unhack deprojectivization, moving it into pipeline Previously the deprojectivize() call was attached to the transition system, and only called for German. Instead it should be a separate process, called after the parser. This makes it available for any language. Closes #898. 
--- spacy/language.py | 1 + spacy/syntax/arc_eager.pyx | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index a90e580ca..920a4c4c8 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -144,6 +144,7 @@ class BaseDefaults(object): pipeline.append(nlp.tagger) if nlp.parser: pipeline.append(nlp.parser) + pipeline.append(Pseudoprojectivity.deprojectivize) if nlp.entity: pipeline.append(nlp.entity) return pipeline diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index bea0c9b45..93bc21e22 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -427,8 +427,6 @@ cdef class ArcEager(TransitionSystem): def finalize_doc(self, doc): doc.is_parsed = True - if doc.vocab.lang == 'de': - PseudoProjectivity.deprojectivize(doc) cdef int set_valid(self, int* output, const StateC* st) nogil: cdef bint[N_MOVES] is_valid From 17a1e7a11961359839ff66785dc3fe0310d3a082 Mon Sep 17 00:00:00 2001 From: Miguel Almeida Date: Fri, 31 Mar 2017 12:21:01 +0100 Subject: [PATCH 07/27] Add Portuguese numbers and ordinals --- spacy/pt/stop_words.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/spacy/pt/stop_words.py b/spacy/pt/stop_words.py index 1faa4a270..30ac19c45 100644 --- a/spacy/pt/stop_words.py +++ b/spacy/pt/stop_words.py @@ -66,3 +66,22 @@ vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós zero """.split()) + + +# Number words + +NUM_WORDS = set(""" +zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze +quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta +sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião +""".split()) + +# Ordinal words + +ORDINAL_WORDS = set(""" +primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo +vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo +octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo +quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo +milésimo milionésimo bilionésimo +""".split()) From c1d020b0a68630bbc9950b09ad72ba38ee1c749f Mon Sep 17 00:00:00 2001 From: Miguel Almeida Date: Fri, 31 Mar 2017 12:26:13 +0100 Subject: [PATCH 08/27] Remove "ista" from Portuguese stop words --- spacy/pt/stop_words.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pt/stop_words.py b/spacy/pt/stop_words.py index 30ac19c45..7c21f760b 100644 --- a/spacy/pt/stop_words.py +++ b/spacy/pt/stop_words.py @@ -27,7 +27,7 @@ geral grande grandes grupo hoje horas há -iniciar inicio ir irá isso ista isto já +iniciar inicio ir irá isso isto já lado ligado local logo longe lugar lá From e854f28304bfea7d7b80e2f52d8a514418a53553 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 31 Mar 2017 13:26:25 +0200 Subject: [PATCH 09/27] Add test for Issue #758 Issue #758 occurs when no actions are available for a single token doc after merging. --- spacy/tests/regression/test_issue758.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 spacy/tests/regression/test_issue758.py diff --git a/spacy/tests/regression/test_issue758.py b/spacy/tests/regression/test_issue758.py new file mode 100644 index 000000000..0dac5737b --- /dev/null +++ b/spacy/tests/regression/test_issue758.py @@ -0,0 +1,17 @@ +from ... 
import load as load_spacy +from ...attrs import LEMMA +from ...matcher import merge_phrase + +import pytest + + + + +@pytest.mark.models +def test_issue758(): +    '''Test parser transition bug after label added.''' +    nlp = load_spacy('en') +    nlp.matcher.add('splash', 'my_entity', {}, +                    [[{LEMMA: 'splash'}, {LEMMA: 'on'}]], +                    on_match=merge_phrase) +    doc = nlp('splash On') From 725249c59a920d0702c42dc206adc2cfd3b4e049 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 31 Mar 2017 13:58:59 +0200 Subject: [PATCH 10/27] Add merge_phrase callback in matcher.pyx --- spacy/matcher.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 1883ae89a..c5e520656 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -161,6 +161,13 @@ def _convert_strings(token_specs, string_store): return tokens +def merge_phrase(matcher, doc, i, matches): +    '''Callback to merge a phrase on match''' +    ent_id, label, start, end = matches[i] +    span = doc[start : end] +    span.merge(ent_type=label, ent_id=ent_id) + + cdef class Matcher: '''Match sequences of tokens, based on pattern rules.''' cdef Pool mem From 1bb7b4ca714e318ca8de206dca74940cb0114d33 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 31 Mar 2017 13:59:19 +0200 Subject: [PATCH 11/27] Add comment --- spacy/syntax/parser.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 764efea8b..123ae03da 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -257,6 +257,7 @@ cdef class Parser: cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil: state = new StateC(tokens, length) # NB: This can change self.moves.n_moves! + # I think this causes memory errors if called by .pipe() self.moves.initialize_state(state) nr_class = self.moves.n_moves From cfff4e0f610a50156b0ea62f392a5b4c0761846a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 31 Mar 2017 13:59:32 +0200 Subject: [PATCH 12/27] Improve test --- spacy/tests/regression/test_issue758.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue758.py b/spacy/tests/regression/test_issue758.py index 0dac5737b..2ddba9975 100644 --- a/spacy/tests/regression/test_issue758.py +++ b/spacy/tests/regression/test_issue758.py @@ -14,4 +14,4 @@ def test_issue758(): nlp.matcher.add('splash', 'my_entity', {}, [[{LEMMA: 'splash'}, {LEMMA: 'on'}]], on_match=merge_phrase) -    doc = nlp('splash On') +    doc = nlp('splash On', parse=False) From 9720103428b1b77b1ecd2d7e328827f935dc0f34 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 31 Mar 2017 13:59:58 +0200 Subject: [PATCH 13/27] Improve attribute handling in doc.merge(). Still unsatisfying --- spacy/tokens/doc.pyx | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 348d84012..ad5141a08 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -668,12 +668,15 @@ cdef class Doc: attributes[LEMMA] = self.vocab.strings[lemma] attributes[ENT_TYPE] = self.vocab.strings[ent_type] elif not args: +        # TODO: This code makes little sense overall. We're still
+ if "label" in attributes and 'ent_type' not in attributes: if type(attributes["label"]) == int: attributes[ENT_TYPE] = attributes["label"] else: attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]] - + if 'ent_type' in attributes: + attributes[ENT_TYPE] = attributes['ent_type'] elif args: raise ValueError( "Doc.merge received %d non-keyword arguments. " @@ -693,6 +696,9 @@ cdef class Doc: tag = self.vocab.strings[attributes.get(TAG, span.root.tag)] lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)] ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)] + ent_id = attributes.get('ent_id', span.root.ent_id) + if not isinstance(ent_id, int): + ent_id = self.vocab.strings[ent_id] # Get LexemeC for newly merged token new_orth = ''.join([t.text_with_ws for t in span]) @@ -713,6 +719,7 @@ cdef class Doc: else: token.ent_iob = 3 token.ent_type = self.vocab.strings[ent_type] + token.ent_id = ent_id # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets # Before thinking of something simpler, beware the case where a dependency From fc3900e5b2e7d5ab311146e8c9e1b0876ef93c06 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 31 Mar 2017 14:00:14 +0200 Subject: [PATCH 14/27] Allow ent_id to be set in Token --- spacy/tokens/token.pyx | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 34de9dee7..ccada6b16 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -506,21 +506,15 @@ cdef class Token: return self.c.ent_id def __set__(self, hash_t key): - # TODO - raise NotImplementedError( - "Can't yet set ent_id from Token. Vote for this feature on the issue " - "tracker: http://github.com/spacy-io/spaCy") + self.c.ent_id = key property ent_id_: '''A (string) entity ID. Usually assigned by patterns in the Matcher.''' def __get__(self): return self.vocab.strings[self.c.ent_id] - def __set__(self, hash_t key): - # TODO - raise NotImplementedError( - "Can't yet set ent_id_ from Token. 
Vote for this feature on the issue " - "tracker: http://github.com/spacy-io/spaCy") + def __set__(self, name): + self.c.ent_id = self.vocab.strings[name] property whitespace_: def __get__(self): From 465b240bcb0ccfa445571830eaee923bfa2638cb Mon Sep 17 00:00:00 2001 From: Miguel Almeida Date: Fri, 31 Mar 2017 13:00:29 +0100 Subject: [PATCH 15/27] Review Portuguese stop words Mainly to review typos and add missing masculines/feminines --- spacy/pt/stop_words.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/spacy/pt/stop_words.py b/spacy/pt/stop_words.py index 7c21f760b..a24356881 100644 --- a/spacy/pt/stop_words.py +++ b/spacy/pt/stop_words.py @@ -3,18 +3,19 @@ from __future__ import unicode_literals STOP_WORDS = set(""" -à às acerca adeus agora ainda algo algumas alguns ali além ambos ano -anos antes ao aos apenas apoio apontar após aquela aquelas aquele aqueles aqui -aquilo area área as assim através atrás até aí +à às acerca adeus agora ainda algo algumas alguns ali além ambas ambos ano +anos antes ao aos apenas apoio apoia apontar após aquela aquelas aquele aqueles +aqui aquilo área as assim através atrás até aí -baixo bastante bem bom breve +baixo bastante bem boa bom breve cada caminho catorze cedo cento certamente certeza cima cinco coisa com como -comprido conhecido conselho contra corrente custa cá +comprido comprida conhecida conhecido conselho contra corrente custa cá -da daquela daquele dar das de debaixo demais dentro depois desde desligado -dessa desse desta deste deve devem deverá dez dezanove dezasseis dezassete -dezoito dia diante direita diz dizem dizer do dois dos doze duas dá dão dúvida +da daquela daquele dar das de debaixo demais dentro depois desde desligada +desligado dessa desse desta deste deve devem deverá dez dezanove dezasseis +dezassete dezoito dia diante direita diz dizem dizer do dois dos doze duas dá +dão dúvida é ela elas ele eles em embora enquanto entre então era és essa essas esse esses esta estado estar estará estas estava este estes esteve estive estivemos @@ -35,34 +36,34 @@ maior maioria maiorias mais mal mas me meio menor menos meses mesmo meu meus mil minha minhas momento muito muitos máximo mês na nada naquela naquele nas nem nenhuma nessa nesse nesta neste no noite nome -nos nossa nossas nosso nossos nova nove novo novos num numa nunca não nível nós -número +nos nossa nossas nosso nossos nova novas nove novo novos num numa nunca nuns +não nível nós número números obra obrigada obrigado oitava oitavo oito onde ontem onze os ou outra outras outro outros para parece parte partir pegar pela pelas pelo pelos perto pessoas pode podem poder poderá podia ponto pontos por porque porquê posição possivelmente posso -possível pouca pouco povo primeira primeiro promeiro próprio próximo puderam -pôde põe põem +possível pouca pouco povo primeira primeiro próprio próxima próximo puderam pôde +põe põem -qual qualquer quando quanto quarta quarto quatro que quem quer quero questão -quieto quinta quinto quinze quê +qual qualquer quando quanto quarta quarto quatro que quem quer querem quero +questão quieta quieto quinta quinto quinze quê relação sabe saber se segunda segundo sei seis sem sempre ser seria sete seu seus sexta sexto sim sistema sob sobre sois somente somos sou sua suas são sétima sétimo -tal talvez também tanto tarde te tem temos tempo tendes tenho tens tentar +tal talvez também tanta tanto tarde te tem temos tempo tendes tenho tens tentar tentaram tente tentei ter terceira terceiro teu teus teve 
tipo tive tivemos tiveram tiveste tivestes toda todas todo todos trabalhar trabalho treze três tu tua tuas tudo tão têm último um uma umas uns usa usar -vai vais valor veja vem vens ver verdade verdadeiro vez vezes viagem vindo -vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós +vai vais valor veja vem vens ver verdade verdadeira verdadeiro vez vezes viagem +vinda vindo vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós zero """.split()) From 4fde64c4eac68dd741c9abeb6bca4fb25698288a Mon Sep 17 00:00:00 2001 From: Miguel Almeida Date: Fri, 31 Mar 2017 15:52:55 +0100 Subject: [PATCH 16/27] Portuguese contractions and some abbreviations --- spacy/pt/tokenizer_exceptions.py | 111 +++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 spacy/pt/tokenizer_exceptions.py diff --git a/spacy/pt/tokenizer_exceptions.py b/spacy/pt/tokenizer_exceptions.py new file mode 100644 index 000000000..1e02f6c6e --- /dev/null +++ b/spacy/pt/tokenizer_exceptions.py @@ -0,0 +1,111 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + +TOKENIZER_EXCEPTIONS = {} + +# Contractions +CONTRACTIONS = {} + +personal_pronoun = ( +    "ele", "ela", "eles", "elas" +) +demonstrative_pronouns = ( +    "este", "esta", "estes", "estas", "isto", "esse", "essa", "esses", "essas", +    "isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo" +) +undefined_pronouns = ( +    "outro", "outra", "outros", "outras" +) +adverbs = ( +    "aqui", "aí", "ali", "além" +) + +for word in personal_pronoun + demonstrative_pronouns + \ +            undefined_pronouns + adverbs: +    CONTRACTIONS["d" + word] = [ +        {ORTH: "d", NORM: "de"}, +        {ORTH: word} +    ] + +for word in personal_pronoun + demonstrative_pronouns + \ +            undefined_pronouns: +    CONTRACTIONS["n" + word] = [ +        {ORTH: "n", NORM: "em"}, +        {ORTH: word} +    ] + +# Not so linear contractions "a"+something + +CONTRACTIONS.update({ +    # This one cannot be split into 2 +    # "à": [ +    #     {ORTH: "à", NORM: "a"}, +    #     {ORTH: "", NORM: "a"} +    # ], +    "às": [ +        {ORTH: "à", NORM: "a"}, +        {ORTH: "s", NORM: "as"} +    ], +    "ao": [ +        {ORTH: "a"}, +        {ORTH: "o"} +    ], +    "aos": [ +        {ORTH: "a"}, +        {ORTH: "os"} +    ], +    "àquele": [ +        {ORTH: "à", NORM: "a"}, +        {ORTH: "quele", NORM: "aquele"} +    ], +    "àquela": [ +        {ORTH: "à", NORM: "a"}, +        {ORTH: "quela", NORM: "aquela"} +    ], +    "àqueles": [ +        {ORTH: "à", NORM: "a"}, +        {ORTH: "queles", NORM: "aqueles"} +    ], +    "àquelas": [ +        {ORTH: "à", NORM: "a"}, +        {ORTH: "quelas", NORM: "aquelas"} +    ], +    "àquilo": [ +        {ORTH: "à", NORM: "a"}, +        {ORTH: "quilo", NORM: "aquilo"} +    ], +    "aonde": [ +        {ORTH: "a"}, +        {ORTH: "onde"} +    ], +}) + +TOKENIZER_EXCEPTIONS.update(CONTRACTIONS) + +# Abbreviations with only one ORTH token + +ORTH_ONLY = [ +    "Adm.", +    "Dr.", +    "e.g.", +    "E.g.", +    "E.G.", +    "Gen.", +    "Gov.", +    "i.e.", +    "I.e.", +    "I.E.", +    "Jr.", +    "Ltd.", +    "p.m.", +    "Ph.D.", +    "Rep.", +    "Rev.", +    "Sen.", +    "Sr.", +    "Sra.", +    "vs.", +] From 51882ee2b8954f71ddd29ca8bd4cf7b0835b0d41 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 31 Mar 2017 19:32:01 +0200 Subject: [PATCH 17/27] Fix check for setting ent_id in merge --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ad5141a08..bffc8bbdf 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -697,7 +697,7 @@ cdef class Doc: tag = self.vocab.strings[attributes.get(TAG, span.root.tag)] lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)] ent_type = 
self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)] ent_id = attributes.get('ent_id', span.root.ent_id) - if not isinstance(ent_id, int): + if isinstance(ent_id, basestring): ent_id = self.vocab.strings[ent_id] # Get LexemeC for newly merged token From e71a1f4bd04eba953323613d8b0581fdce42fa74 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 1 Apr 2017 10:19:32 +0200 Subject: [PATCH 18/27] Fix download commands in error messages (see #946) --- spacy/lexeme.pyx | 2 +- spacy/tokens/doc.pyx | 4 ++-- spacy/tokens/span.pyx | 2 +- spacy/tokens/token.pyx | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 1d5421d74..38789bfe9 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -134,7 +134,7 @@ cdef class Lexeme: raise ValueError( "Word vectors set to length 0. This may be because the " "data is not installed. If you haven't already, run" - "\npython -m spacy.%s.download all\n" + "\npython -m spacy download %s\n" "to install the data." % self.vocab.lang ) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index bffc8bbdf..d59317747 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -394,7 +394,7 @@ cdef class Doc: raise ValueError( "noun_chunks requires the dependency parse, which " "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy.%s.download all\n" + "\npython -m spacy download %s\n" "to install the data" % self.vocab.lang) # Accumulate the result before beginning to iterate over it. This prevents # the tokenisation from being changed out from under us during the iteration. @@ -427,7 +427,7 @@ cdef class Doc: raise ValueError( "sentence boundary detection requires the dependency parse, which " "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy.%s.download all\n" + "\npython -m spacy download %s\n" "to install the data" % self.vocab.lang) cdef int i start = 0 diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index fc5d26174..ee8a6af7f 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -210,7 +210,7 @@ cdef class Span: raise ValueError( "noun_chunks requires the dependency parse, which " "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy.%s.download all\n" + "\npython -m spacy download %s\n" "to install the data" % self.vocab.lang) # Accumulate the result before beginning to iterate over it. This prevents # the tokenisation from being changed out from under us during the iteration. diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index ccada6b16..a89b35eee 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -236,7 +236,7 @@ cdef class Token: raise ValueError( "Word vectors set to length 0. This may be because the " "data is not installed. If you haven't already, run" - "\npython -m spacy.%s.download all\n" + "\npython -m spacy download %s\n" "to install the data." % self.vocab.lang ) vector_view = self.c.lex.vector From 3b667a24d404a48297777ea5ca98227fdd358d69 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 1 Apr 2017 10:19:01 +0200 Subject: [PATCH 19/27] Remove whitespace --- spacy/lexeme.pyx | 30 +++++++++++++++--------------- spacy/tokens/span.pyx | 26 +++++++++++++------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 38789bfe9..3a26161bb 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -87,7 +87,7 @@ cdef class Lexeme: value (bool): The new value of the flag. 
""" Lexeme.c_set_flag(self.c, flag_id, value) - + def check_flag(self, attr_id_t flag_id): """Check the value of a boolean flag. @@ -137,7 +137,7 @@ cdef class Lexeme: "\npython -m spacy download %s\n" "to install the data." % self.vocab.lang ) - + vector_view = self.c.vector return numpy.asarray(vector_view) @@ -163,7 +163,7 @@ cdef class Lexeme: return self.c.sentiment def __set__(self, float sentiment): self.c.sentiment = sentiment - + property orth_: def __get__(self): return self.vocab.strings[self.c.orth] @@ -171,7 +171,7 @@ cdef class Lexeme: property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x - + property norm: def __get__(self): return self.c.norm def __set__(self, int x): self.c.norm = x @@ -187,11 +187,11 @@ cdef class Lexeme: property suffix: def __get__(self): return self.c.suffix def __set__(self, int x): self.c.suffix = x - + property cluster: def __get__(self): return self.c.cluster def __set__(self, int x): self.c.cluster = x - + property lang: def __get__(self): return self.c.lang def __set__(self, int x): self.c.lang = x @@ -203,11 +203,11 @@ cdef class Lexeme: property lower_: def __get__(self): return self.vocab.strings[self.c.lower] def __set__(self, unicode x): self.c.lower = self.vocab.strings[x] - + property norm_: def __get__(self): return self.vocab.strings[self.c.norm] def __set__(self, unicode x): self.c.norm = self.vocab.strings[x] - + property shape_: def __get__(self): return self.vocab.strings[self.c.shape] def __set__(self, unicode x): self.c.shape = self.vocab.strings[x] @@ -239,7 +239,7 @@ cdef class Lexeme: property is_alpha: def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x) - + property is_ascii: def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x) @@ -260,23 +260,23 @@ cdef class Lexeme: def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x) - property is_space: + property is_space: def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x) - property is_bracket: + property is_bracket: def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x) - property is_quote: + property is_quote: def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x) - property is_left_punct: + property is_left_punct: def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) - property is_right_punct: + property is_right_punct: def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) @@ -284,7 +284,7 @@ cdef class Lexeme: property like_url: def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL) def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x) - + property like_num: def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM) def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index ee8a6af7f..f43d47876 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -128,13 +128,13 @@ cdef class Span: end = 
token_by_end(self.doc.c, self.doc.length, self.end_char) if end == -1: raise IndexError("Error calculating span: Can't find end") - + self.start = start self.end = end + 1 property sent: '''The sentence span that this span is a part of. - + Returns: Span The sentence this is part of. ''' @@ -157,7 +157,7 @@ cdef class Span: if 'has_vector' in self.doc.user_span_hooks: return self.doc.user_span_hooks['has_vector'](self) return any(token.has_vector for token in self) - + property vector: def __get__(self): if 'vector' in self.doc.user_span_hooks: @@ -200,9 +200,9 @@ cdef class Span: property noun_chunks: ''' Yields base noun-phrase #[code Span] objects, if the document - has been syntactically parsed. A base noun phrase, or - 'NP chunk', is a noun phrase that does not permit other NPs to - be nested within it – so no NP-level coordination, no prepositional + has been syntactically parsed. A base noun phrase, or + 'NP chunk', is a noun phrase that does not permit other NPs to + be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. For example: ''' def __get__(self): @@ -227,19 +227,19 @@ cdef class Span: Returns: Token: The root token. - + i.e. has the shortest path to the root of the sentence (or is the root itself). If multiple words are equally high in the tree, the first word is taken. - + For example: - + >>> toks = nlp(u'I like New York in Autumn.') Let's name the indices --- easier than writing "toks[4]" etc. - >>> i, like, new, york, in_, autumn, dot = range(len(toks)) + >>> i, like, new, york, in_, autumn, dot = range(len(toks)) The head of 'new' is 'York', and the head of 'York' is 'like' @@ -301,10 +301,10 @@ cdef class Span: return self.doc[self.start] else: return self.doc[root] - + property lefts: """Tokens that are to the left of the span, whose head is within the Span. - + Yields: Token A left-child of a token of the span. """ def __get__(self): @@ -315,7 +315,7 @@ cdef class Span: property rights: """Tokens that are to the right of the Span, whose head is within the Span. - + Yields: Token A right-child of a token of the span. """ def __get__(self): From ad8bf1829f72e0bf769081c7f9a8008a82df7504 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 1 Apr 2017 10:37:42 +0200 Subject: [PATCH 20/27] Import and combine Portuguese tokenizer exceptions (see #943) --- spacy/pt/language_data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py index f9899d8d1..d96cdd38f 100644 --- a/spacy/pt/language_data.py +++ b/spacy/pt/language_data.py @@ -5,13 +5,15 @@ from .. 
import language_data as base from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS - +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY STOP_WORDS = set(STOP_WORDS) -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] From 2de2195be887acb25e9a364293842e87ea1ee9ba Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 1 Apr 2017 10:39:42 +0200 Subject: [PATCH 21/27] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 27d280785..53807208c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -33,6 +33,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Matthew Honnibal, [@honnibal](https://github.com/honnibal) * Maxim Samsonov, [@maxirmx](https://github.com/maxirmx) * Michael Wallin, [@wallinm1](https://github.com/wallinm1) +* Miguel Almeida, [@mamoit](https://github.com/mamoit) * Oleg Zd, [@olegzd](https://github.com/olegzd) * Pokey Rule, [@pokey](https://github.com/pokey) * Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202) From 2c36a61ec531c570d51e2afbe47bce0f5364cfd6 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 3 Apr 2017 18:12:38 +0200 Subject: [PATCH 22/27] Add spacyr to libraries --- website/docs/usage/_data.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index c219585f9..c8c85af1d 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -151,6 +151,11 @@ "url": "https://github.com/golastmile/rasa_nlu", "author": "LASTMILE", "description": "High level APIs for building your own language parser using existing NLP and ML libraries." + }, + "spacyr": { + "url": "https://github.com/kbenoit/spacyr", + "author": "Kenneth Benoit", + "description": "An R wrapper for spaCy." 
} }, "visualizations": { From 808cd6cf7f184e20d9b8e42364f7e10f045028dc Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 3 Apr 2017 18:12:52 +0200 Subject: [PATCH 23/27] Add missing tags to verbs (resolves #948) --- spacy/en/tokenizer_exceptions.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 07b01c4fb..3d009241b 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -213,15 +213,15 @@ for verb_data in [ {ORTH: "does", LEMMA: "do"}, {ORTH: "did", LEMMA: "do", TAG: "VBD"}, {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "may"}, - {ORTH: "might"}, - {ORTH: "must"}, + {ORTH: "may", TAG: "MD"}, + {ORTH: "might", TAG: "MD"}, + {ORTH: "must", TAG: "MD"}, {ORTH: "need"}, {ORTH: "ought"}, - {ORTH: "sha", LEMMA: "shall"}, - {ORTH: "should"}, - {ORTH: "wo", LEMMA: "will"}, - {ORTH: "would"} + {ORTH: "sha", LEMMA: "shall", TAG: "MD"}, + {ORTH: "should", TAG: "MD"}, + {ORTH: "wo", LEMMA: "will", TAG: "MD"}, + {ORTH: "would", TAG: "MD"} ]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() From 010293fb2f0ce0841f3eb4ebe7e53e4d0ec0133a Mon Sep 17 00:00:00 2001 From: oeg Date: Thu, 6 Apr 2017 17:33:15 +0200 Subject: [PATCH 24/27] fix(typo): Fixes typo in method calling PseudoProjectivity.deprojectivize, failing with new train cli --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 920a4c4c8..25bfb9e08 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -144,7 +144,7 @@ class BaseDefaults(object): pipeline.append(nlp.tagger) if nlp.parser: pipeline.append(nlp.parser) - pipeline.append(Pseudoprojectivity.deprojectivize) + pipeline.append(PseudoProjectivity.deprojectivize) if nlp.entity: pipeline.append(nlp.entity) return pipeline From 75f9b4c6e288f93513c85617a79fd6b5bd84ac2a Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 7 Apr 2017 10:21:29 +0200 Subject: [PATCH 25/27] Fix whitespace --- spacy/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 6c25ce0e8..2d9812839 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -7,7 +7,6 @@ import re import os.path import pathlib import sys - import textwrap From d6bbc3ffcdb4e7e3ba88abf41798c534fab449b1 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 7 Apr 2017 10:21:43 +0200 Subject: [PATCH 26/27] Fix formatting --- website/docs/usage/troubleshooting.jade | 1 - 1 file changed, 1 deletion(-) diff --git a/website/docs/usage/troubleshooting.jade b/website/docs/usage/troubleshooting.jade index 06454b055..8af611859 100644 --- a/website/docs/usage/troubleshooting.jade +++ b/website/docs/usage/troubleshooting.jade @@ -33,7 +33,6 @@ p | import the language's #[code Language] class instead, for example | #[code from spacy.fr import French]. - +h(3, "symlink-privilege") Symbolic link privilege not held +code(false, "text"). 
From f33c4cbae11354133acceb2be655d946b7c3b5f8 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 7 Apr 2017 10:22:06 +0200 Subject: [PATCH 27/27] Add --no-cache-dir error to troubleshooting docs (see #958) --- website/docs/usage/troubleshooting.jade | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/website/docs/usage/troubleshooting.jade b/website/docs/usage/troubleshooting.jade index 8af611859..cb8271343 100644 --- a/website/docs/usage/troubleshooting.jade +++ b/website/docs/usage/troubleshooting.jade @@ -50,6 +50,20 @@ p | or use a #[code virtualenv] to install spaCy in a user directory, instead | of doing a system-wide installation. ++h(3, "no-cache-dir") No such option: --no-cache-dir + ++code(false, "text"). + no such option: --no-cache-dir + +p + | The #[code download] command uses pip to install the models and sets the + | #[code --no-cache-dir] flag to prevent it from requiring too much memory. + | #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting] + | requires pip v6.0 or newer. + ++infobox("Solution") + | Run #[code pip install -U pip] to upgrade to the latest version of pip. + | To see which version you have installed, run #[code pip --version]. +h(3, "import-error") Import error