Merge branch 'master' of https://github.com/explosion/spaCy

2025-07-05 20:33:10 +03:00 · 2017-04-07 15:50:14 +02:00 · 2017-04-07 15:50:14 +02:00 · be204ed714
commit be204ed714
parent e7b1ee9efd a5538d93d0
16 changed files with 333 additions and 36 deletions
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@ -12,6 +12,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Christoph Schwienheer, [@chssch](https://github.com/chssch)
 * Dafne van Kuppevelt, [@dafnevk](https://github.com/dafnevk)
 * Daniel Rapp, [@rappdw](https://github.com/rappdw)
+* Daniel Vila Suero, [@dvsrepo](https://github.com/dvsrepo)
 * Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi)
 * Eric Zhao, [@ericzhao28](https://github.com/ericzhao28)
 * Greg Baker, [@solresol](https://github.com/solresol)
@ -33,6 +34,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Matthew Honnibal, [@honnibal](https://github.com/honnibal)
 * Maxim Samsonov, [@maxirmx](https://github.com/maxirmx)
 * Michael Wallin, [@wallinm1](https://github.com/wallinm1)
+* Miguel Almeida, [@mamoit](https://github.com/mamoit)
 * Oleg Zd, [@olegzd](https://github.com/olegzd)
 * Pokey Rule, [@pokey](https://github.com/pokey)
 * Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202)
--- a/examples/keras_parikh_entailment/README.md
+++ b/examples/keras_parikh_entailment/README.md
@ -78,7 +78,7 @@ You can run the `keras_parikh_entailment/` directory as a script, which executes
 [`keras_parikh_entailment/__main__.py`](__main__.py). The first thing you'll want to do is train the model:

 ```bash
-python keras_parikh_entailment/ train <your_model_dir> <train_directory> <dev_directory>
+python keras_parikh_entailment/ train <train_directory> <dev_directory>
 ```

 Training takes about 300 epochs for full accuracy, and I haven't rerun the full
--- a/examples/keras_parikh_entailment/__main__.py
+++ b/examples/keras_parikh_entailment/__main__.py
@ -52,7 +52,7 @@ def train(train_loc, dev_loc, shape, settings):
        file_.write(model.to_json())


-def evaluate(model_dir, dev_loc):
+def evaluate(dev_loc):
    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
    nlp = spacy.load('en',
            create_pipeline=create_similarity_pipeline)
--- a/examples/keras_parikh_entailment/spacy_hook.py
+++ b/examples/keras_parikh_entailment/spacy_hook.py
@ -80,10 +80,10 @@ def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr
    return Xs


-def create_similarity_pipeline(nlp):
+def create_similarity_pipeline(nlp, max_length=100):
+    """Build the processing pipeline with the Keras similarity hook appended.
+
+    nlp: the loaded pipeline object; its tagger, entity recogniser and parser
+        are reused, and the similarity model is loaded from
+        nlp.path / 'similarity'.
+    max_length: forwarded to KerasSimilarityShim.load as `max_length`.
+    Returns the list of pipeline components, in the order they are run.
+    """
    return [
        nlp.tagger,
        nlp.entity,
        nlp.parser,
-        KerasSimilarityShim.load(nlp.path / 'similarity', nlp, max_length=10)
+        # Pass max_length by keyword: a bare third positional argument would
+        # bind to whatever parameter follows `nlp` in load()'s signature.
+        KerasSimilarityShim.load(nlp.path / 'similarity', nlp,
+                                 max_length=max_length)
    ]
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@ -9,12 +9,13 @@ from spacy.cli import link as cli_link
 from spacy.cli import info as cli_info
 from spacy.cli import package as cli_package
 from spacy.cli import train as cli_train
+from spacy.cli import model as cli_model


 class CLI(object):
    """Command-line interface for spaCy"""

-    commands = ('download', 'link', 'info', 'package', 'train')
+    commands = ('download', 'link', 'info', 'package', 'train', 'model')

    @plac.annotations(
        model=("model to download (shortcut or model name)", "positional", None, str),
@ -95,6 +96,20 @@ class CLI(object):
        cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
                  not no_parser, not no_ner, parser_L1)

+    # Argument metadata for the `model` sub-command. Each tuple starts with the
+    # help text, followed by the argument kind ("positional") and the type;
+    # the None slot is presumably plac's abbreviation field -- TODO confirm.
+    @plac.annotations(
+        lang=("model language", "positional", None, str),
+        model_dir=("output directory to store model in", "positional", None, str),
+        freqs_data=("tab-separated frequencies file", "positional", None, str),
+        clusters_data=("Brown clusters file", "positional", None, str),
+        vectors_data=("word vectors file", "positional", None, str)
+    )
+    def model(self, lang, model_dir, freqs_data, clusters_data=None, vectors_data=None):
+        """
+        Initialize a new model and its data directory.
+        """
+
+        # Thin wrapper: all work happens in spacy.cli.model. clusters_data and
+        # vectors_data are optional and are passed through as None when the
+        # user omits them.
+        cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
+

    def __missing__(self, name):
        print("\n   Command %r does not exist."
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@ -3,3 +3,4 @@ from .info import info
 from .link import link
 from .package import package
 from .train import train, train_config
+from .model import model
--- a/spacy/cli/model.py
+++ b/spacy/cli/model.py
@ -0,0 +1,129 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import gzip
+import math
+from ast import literal_eval
+from pathlib import Path
+from preshed.counter import PreshCounter
+
+from ..vocab import write_binary_vectors
+from .. import util
+
+
+def model(lang, model_dir, freqs_data, clusters_data, vectors_data):
+    """Initialise a vocab for `lang` from frequency data and save it.
+
+    lang: language ID handed to util.get_lang_class.
+    model_dir: directory the model data is written to.
+    freqs_data: path of the tab-separated frequencies file (required).
+    clusters_data: path of the Brown clusters file, or a falsy value to skip.
+    vectors_data: path of the word vectors file, or a falsy value to skip.
+    """
+    freqs_path = Path(freqs_data)
+    clusters_path = None
+    if clusters_data:
+        clusters_path = Path(clusters_data)
+    vectors_path = None
+    if vectors_data:
+        vectors_path = Path(vectors_data)
+    # Validate the input files before doing any expensive work.
+    check_dirs(freqs_path, clusters_path, vectors_path)
+
+    vocab = util.get_lang_class(lang).Defaults.create_vocab()
+    probs, oov_prob = read_probs(freqs_path)
+    clusters = {}
+    if clusters_path:
+        clusters = read_clusters(clusters_path)
+    populate_vocab(vocab, clusters, probs, oov_prob)
+    create_model(Path(model_dir), vectors_path, vocab, oov_prob)
+
+
+def create_model(model_path, vectors_path, vocab, oov_prob):
+    """Write the vocab data, and optionally the vectors, to the model directory.
+
+    model_path (Path): output directory. It is created, together with any
+        missing parent directories, if it does not exist yet.
+    vectors_path (Path or None): word vectors file to convert to
+        model_path/vec.bin; skipped when falsy.
+    vocab: object providing dump(path) and strings.dump(file).
+    oov_prob (float): written to vocab/oov_prob formatted with '%f'.
+    """
+    vocab_path = model_path / 'vocab'
+    lexemes_path = vocab_path / 'lexemes.bin'
+    strings_path = vocab_path / 'strings.json'
+    oov_path = vocab_path / 'oov_prob'
+
+    # parents=True creates model_path (and anything above it) as needed, so a
+    # nested output directory such as "models/pt/v1" works on the first run.
+    if not vocab_path.exists():
+        vocab_path.mkdir(parents=True)
+    vocab.dump(lexemes_path.as_posix())
+    with strings_path.open('w') as f:
+        vocab.strings.dump(f)
+    with oov_path.open('w') as f:
+        f.write('%f' % oov_prob)
+    if vectors_path:
+        vectors_dest = model_path / 'vec.bin'
+        write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix())
+
+
+def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
+    """Read smoothed log-probabilities from a frequencies file.
+
+    Each line is expected to hold three tab-separated fields: the frequency,
+    the document frequency, and the word written as a Python literal (it is
+    parsed with ast.literal_eval).
+
+    freqs_path (Path): frequencies file, optionally gzipped.
+    max_length (int): entries whose raw key field is this long or longer
+        are dropped.
+    min_doc_freq (int): minimum document frequency for a word to be kept.
+    min_freq (int): minimum frequency for a word to be kept.
+    Returns (probs, oov_prob): a dict mapping each kept word to its log
+        probability, and the log probability assigned to a count of zero.
+    """
+    counts = PreshCounter()
+    total = 0
+    # First pass: collect every count, keyed by 1-based line number, so the
+    # smoother is fitted on the full distribution, not just the kept words.
+    with check_unzip(freqs_path) as freqs_file:
+        for i, line in enumerate(freqs_file):
+            freq, doc_freq, key = line.rstrip().split('\t', 2)
+            freq = int(freq)
+            counts.inc(i+1, freq)
+            total += freq
+    counts.smooth()
+    log_total = math.log(total)
+    probs = {}
+    # Second pass: keep only the words that clear the thresholds. The file is
+    # re-opened rather than rewound, and closed again by the with-block.
+    with check_unzip(freqs_path) as freqs_file:
+        for line in freqs_file:
+            freq, doc_freq, key = line.rstrip().split('\t', 2)
+            doc_freq = int(doc_freq)
+            freq = int(freq)
+            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
+                word = literal_eval(key)
+                smooth_count = counts.smoother(freq)
+                probs[word] = math.log(smooth_count) - log_total
+    oov_prob = math.log(counts.smoother(0)) - log_total
+    return probs, oov_prob
+
+
+def read_clusters(clusters_path):
+    """Load a word -> cluster mapping from a Brown clusters file.
+
+    Every usable line has three whitespace-separated fields: cluster, word
+    and frequency. Lines with any other number of fields are ignored.
+
+    clusters_path (Path): the clusters file.
+    Returns a dict mapping words (plus re-cased variants) to cluster strings.
+    """
+    clusters = {}
+    with clusters_path.open() as f:
+        for line in f:
+            fields = line.split()
+            if len(fields) != 3:
+                continue
+            cluster, word, freq = fields
+            # If the clusterer has only seen the word a few times, its
+            # cluster is unreliable, so fall back to '0'.
+            clusters[word] = cluster if int(freq) >= 3 else '0'
+    # Expand clusters with re-casing; spellings already present win.
+    for word, cluster in list(clusters.items()):
+        for variant in (word.lower(), word.title(), word.upper()):
+            clusters.setdefault(variant, cluster)
+    return clusters
+
+
+def populate_vocab(vocab, clusters, probs, oov_probs):
+    """Create a lexeme for every known word and set its prob and cluster.
+
+    vocab: mapping-like object; vocab[word] returns the lexeme to fill in.
+    clusters (dict): word -> cluster bit string, e.g. '0101'.
+    probs (dict): word -> log probability. Mutated in place: words that only
+        occur in `clusters` are added with the out-of-vocabulary value.
+    oov_probs (float): log probability used for words missing from `probs`.
+    """
+    # Ensure probs has entries for all words seen during clustering.
+    for word in clusters:
+        if word not in probs:
+            probs[word] = oov_probs
+    # Look words up from most to least probable. reversed(sorted(...)) is kept
+    # (instead of reverse=True) so ties are visited in the same order as before.
+    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
+        lexeme = vocab[word]
+        lexeme.prob = prob
+        lexeme.is_oov = False
+        # Decode as a little-endian string, so that we can do & 15 to get
+        # the first 4 bits. See _parse_features.pyx
+        if word in clusters:
+            lexeme.cluster = int(clusters[word][::-1], 2)
+        else:
+            lexeme.cluster = 0
+
+
+def check_unzip(file_path):
+    """Open a data file for reading as text, decompressing gzip on the fly.
+
+    file_path (Path): file to open. Paths ending in 'gz' are treated as
+        gzip-compressed and decoded as UTF-8; anything else is opened with
+        Path.open() and its default text settings.
+    Returns an open text file object; the caller is responsible for closing
+        it (it can be used in a with-statement).
+    """
+    file_path_str = file_path.as_posix()
+    if file_path_str.endswith('gz'):
+        import io
+        # gzip.open() defaults to binary mode, which yields bytes on Python 3
+        # and breaks callers that split lines on a text '\t'. Wrapping in
+        # BufferedReader + TextIOWrapper gives text on Python 2 and 3 alike.
+        raw = io.BufferedReader(gzip.open(file_path_str, 'rb'))
+        return io.TextIOWrapper(raw, encoding='utf8')
+    else:
+        return file_path.open()
+
+
+def check_dirs(freqs_data, clusters_data, vectors_data):
+    """Report any input file that does not exist via util.sys_exit.
+
+    freqs_data (Path): frequencies file; always required.
+    clusters_data (Path or None): Brown clusters file; checked only if given.
+    vectors_data (Path or None): word vectors file; checked only if given.
+    """
+    if not freqs_data.is_file():
+        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
+    optional_inputs = (
+        (clusters_data, "No Brown clusters file found"),
+        (vectors_data, "No word vectors file found"),
+    )
+    for path, title in optional_inputs:
+        if path and not path.is_file():
+            util.sys_exit(path.as_posix(), title=title)
--- a/spacy/en/tokenizer_exceptions.py
+++ b/spacy/en/tokenizer_exceptions.py
@ -213,15 +213,15 @@ for verb_data in [
    {ORTH: "does", LEMMA: "do"},
    {ORTH: "did", LEMMA: "do", TAG: "VBD"},
    {ORTH: "had", LEMMA: "have", TAG: "VBD"},
-    {ORTH: "may"},
-    {ORTH: "might"},
-    {ORTH: "must"},
+    {ORTH: "may", TAG: "MD"},
+    {ORTH: "might", TAG: "MD"},
+    {ORTH: "must", TAG: "MD"},
    {ORTH: "need"},
    {ORTH: "ought"},
-    {ORTH: "sha", LEMMA: "shall"},
-    {ORTH: "should"},
-    {ORTH: "wo", LEMMA: "will"},
-    {ORTH: "would"}
+    {ORTH: "sha", LEMMA: "shall", TAG: "MD"},
+    {ORTH: "should", TAG: "MD"},
+    {ORTH: "wo", LEMMA: "will", TAG: "MD"},
+    {ORTH: "would", TAG: "MD"}
 ]:
    verb_data_tc = dict(verb_data)
    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
--- a/spacy/language.py
+++ b/spacy/language.py
@ -144,7 +144,7 @@ class BaseDefaults(object):
            pipeline.append(nlp.tagger)
        if nlp.parser:
            pipeline.append(nlp.parser)
-            pipeline.append(Pseudoprojectivity.deprojectivize)
+            pipeline.append(PseudoProjectivity.deprojectivize)
        if nlp.entity:
            pipeline.append(nlp.entity)
        return pipeline
--- a/spacy/pt/language_data.py
+++ b/spacy/pt/language_data.py
@ -5,13 +5,15 @@ from .. import language_data as base
 from ..language_data import update_exc, strings_to_exc

 from .stop_words import STOP_WORDS
-
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY

 STOP_WORDS = set(STOP_WORDS)


-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))


 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
--- a/spacy/pt/stop_words.py
+++ b/spacy/pt/stop_words.py
@ -3,18 +3,19 @@ from __future__ import unicode_literals


 STOP_WORDS = set("""
-à às acerca adeus agora ainda algo algumas alguns ali além ambos ano
-anos antes ao aos apenas apoio apontar após aquela aquelas aquele aqueles aqui
-aquilo area área as assim através atrás até aí
+à às acerca adeus agora ainda algo algumas alguns ali além ambas ambos ano
+anos antes ao aos apenas apoio apoia apontar após aquela aquelas aquele aqueles
+aqui aquilo área as assim através atrás até aí

-baixo bastante bem bom breve
+baixo bastante bem boa bom breve

 cada caminho catorze cedo cento certamente certeza cima cinco coisa com como
-comprido conhecido conselho contra corrente custa cá
+comprido comprida conhecida conhecido conselho contra corrente custa cá

-da daquela daquele dar das de debaixo demais dentro depois desde desligado
-dessa desse desta deste deve devem deverá dez dezanove dezasseis dezassete
-dezoito dia diante direita diz dizem dizer do dois dos doze duas dá dão dúvida
+da daquela daquele dar das de debaixo demais dentro depois desde desligada
+desligado dessa desse desta deste deve devem deverá dez dezanove dezasseis
+dezassete dezoito dia diante direita diz dizem dizer do dois dos doze duas dá
+dão dúvida

 é ela elas ele eles em embora enquanto entre então era és essa essas esse esses
 esta estado estar estará estas estava este estes esteve estive estivemos
@ -27,7 +28,7 @@ geral grande grandes grupo

 hoje horas há

-iniciar inicio ir irá isso ista isto já
+iniciar inicio ir irá isso isto já

 lado ligado local logo longe lugar lá

@ -35,34 +36,53 @@ maior maioria maiorias mais mal mas me meio menor menos meses mesmo meu meus
 mil minha minhas momento muito muitos máximo mês

 na nada naquela naquele nas nem nenhuma nessa nesse nesta neste no noite nome
-nos nossa nossas nosso nossos nova nove novo novos num numa nunca não nível nós
-número
+nos nossa nossas nosso nossos nova novas nove novo novos num numa nunca nuns
+não nível nós número números

 obra obrigada obrigado oitava oitavo oito onde ontem onze os ou outra outras
 outro outros

 para parece parte partir pegar pela pelas pelo pelos perto pessoas pode podem
 poder poderá podia ponto pontos por porque porquê posição possivelmente posso
-possível pouca pouco povo primeira primeiro promeiro próprio próximo puderam
-pôde põe põem
+possível pouca pouco povo primeira primeiro próprio próxima próximo puderam pôde
+põe põem

-qual qualquer quando quanto quarta quarto quatro que quem quer quero questão
-quieto quinta quinto quinze quê
+qual qualquer quando quanto quarta quarto quatro que quem quer querem quero
+questão quieta quieto quinta quinto quinze quê

 relação

 sabe saber se segunda segundo sei seis sem sempre ser seria sete seu seus sexta
 sexto sim sistema sob sobre sois somente somos sou sua suas são sétima sétimo

-tal talvez também tanto tarde te tem temos tempo tendes tenho tens tentar
+tal talvez também tanta tanto tarde te tem temos tempo tendes tenho tens tentar
 tentaram tente tentei ter terceira terceiro teu teus teve tipo tive tivemos
 tiveram tiveste tivestes toda todas todo todos trabalhar trabalho treze três tu
 tua tuas tudo tão têm

 último um uma umas uns usa usar

-vai vais valor veja vem vens ver verdade verdadeiro vez vezes viagem vindo
-vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós
+vai vais valor veja vem vens ver verdade verdadeira verdadeiro vez vezes viagem
+vinda vindo vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós

 zero
 """.split())
+
+
+# Cardinal number words, lower-case. The literal is split on whitespace, so
+# line breaks inside it carry no meaning.
+
+NUM_WORDS = set("""
+zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze
+quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta
+sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião
+""".split())
+
+# Ordinal number words, lower-case and masculine singular only; built the same
+# way as NUM_WORDS.
+
+ORDINAL_WORDS = set("""
+primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo
+vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo
+octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo
+quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo
+milésimo milionésimo bilionésimo
+""".split())
--- a/spacy/pt/tokenizer_exceptions.py
+++ b/spacy/pt/tokenizer_exceptions.py
@ -0,0 +1,111 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+from ..language_data import PRON_LEMMA
+
+TOKENIZER_EXCEPTIONS = {}
+
+# Contractions: maps the contracted surface form to the list of token
+# attribute dicts it is split into.
+CONTRACTIONS = {}
+
+personal_pronoun = ("ele", "ela", "eles", "elas")
+demonstrative_pronouns = (
+    "este", "esta", "estes", "estas", "isto", "esse", "essa", "esses", "essas",
+    "isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo"
+)
+undefined_pronouns = ("outro", "outra", "outros", "outras")
+adverbs = ("aqui", "aí", "ali", "além")
+
+_pronouns = personal_pronoun + demonstrative_pronouns + undefined_pronouns
+
+# "d" + word (e.g. "dele"): the "d" token is normalised to "de". This is the
+# only pattern that also applies to the adverbs.
+for word in _pronouns + adverbs:
+    CONTRACTIONS["d" + word] = [{ORTH: "d", NORM: "de"}, {ORTH: word}]
+
+# "n" + word (e.g. "nele"): the "n" token is normalised to "em".
+for word in _pronouns:
+    CONTRACTIONS["n" + word] = [{ORTH: "n", NORM: "em"}, {ORTH: word}]
+
+# Irregular contractions starting with "a" / "à". Unlike the "d"/"n" prefixes,
+# these cannot be generated by simple concatenation, so they are listed
+# explicitly.
+
+CONTRACTIONS.update({
+    # "à" on its own is a single character, so its text cannot be divided
+    # between two tokens; the entry is left disabled for that reason.
+    # "à": [
+    #     {ORTH: "à", NORM: "a"},
+    #     {ORTH: "", NORM: "a"}
+    # ],
+    "às": [
+        {ORTH: "à", NORM: "a"},
+        {ORTH: "s", NORM: "as"}
+    ],
+    "ao": [
+        {ORTH: "a"},
+        {ORTH: "o"}
+    ],
+    "aos": [
+        {ORTH: "a"},
+        {ORTH: "os"}
+    ],
+    "àquele": [
+        {ORTH: "à", NORM: "a"},
+        {ORTH: "quele", NORM: "aquele"}
+    ],
+    "àquela": [
+        {ORTH: "à", NORM: "a"},
+        {ORTH: "quela", NORM: "aquela"}
+    ],
+    "àqueles": [
+        {ORTH: "à", NORM: "a"},
+        {ORTH: "queles", NORM: "aqueles"}
+    ],
+    "àquelas": [
+        {ORTH: "à", NORM: "a"},
+        {ORTH: "quelas", NORM: "aquelas"}
+    ],
+    "àquilo": [
+        {ORTH: "à", NORM: "a"},
+        {ORTH: "quilo", NORM: "aquilo"}
+    ],
+    "aonde": [
+        {ORTH: "a"},
+        {ORTH: "onde"}
+    ],
+})
+
+# Publish every contraction as a tokenizer exception.
+TOKENIZER_EXCEPTIONS.update(CONTRACTIONS)
+
+# Abbreviations that are kept as a single token: each string is listed with
+# its exact spelling only (no lemma, tag or norm), and case variants are
+# spelled out explicitly where wanted.
+
+ORTH_ONLY = [
+    "Adm.",
+    "Dr.",
+    "e.g.",
+    "E.g.",
+    "E.G.",
+    "Gen.",
+    "Gov.",
+    "i.e.",
+    "I.e.",
+    "I.E.",
+    "Jr.",
+    "Ltd.",
+    "p.m.",
+    "Ph.D.",
+    "Rep.",
+    "Rev.",
+    "Sen.",
+    "Sr.",
+    "Sra.",
+    "vs.",
+]
--- a/spacy/util.py
+++ b/spacy/util.py
@ -7,7 +7,6 @@ import re
 import os.path
 import pathlib
 import sys
-
 import textwrap


--- a/website/_harp.json
+++ b/website/_harp.json
@ -55,7 +55,7 @@
            }
        },

-        "V_CSS": "1.2",
+        "V_CSS": "1.3",
        "V_JS": "1.2",
        "DEFAULT_SYNTAX": "python",
        "ANALYTICS": "UA-58931649-1",
--- a/website/docs/usage/_data.json
+++ b/website/docs/usage/_data.json
@ -151,6 +151,11 @@
                "url": "https://github.com/golastmile/rasa_nlu",
                "author": "LASTMILE",
                "description": "High level APIs for building your own language parser using existing NLP and ML libraries."
+            },
+            "spacyr": {
+                "url": "https://github.com/kbenoit/spacyr",
+                "author": "Kenneth Benoit",
+                "description": "An R wrapper for spaCy."
            }
        },
        "visualizations": {
--- a/website/docs/usage/troubleshooting.jade
+++ b/website/docs/usage/troubleshooting.jade
@ -33,7 +33,6 @@ p
    |  import the language's #[code Language] class instead, for example
    |  #[code from spacy.fr import French].

-
 +h(3, "symlink-privilege") Symbolic link privilege not held

 +code(false, "text").
@ -51,6 +50,20 @@ p
    |  or use a #[code virtualenv] to install spaCy in a user directory, instead
    |  of doing a system-wide installation.

++h(3, "no-cache-dir") No such option: --no-cache-dir
+
++code(false, "text").
+    no such option: --no-cache-dir
+
+p
+    |  The #[code download] command uses pip to install the models and sets the
+    |  #[code --no-cache-dir] flag to prevent it from requiring too much memory.
+    |  #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting]
+    |  requires pip v6.0 or newer.
+
++infobox("Solution")
+    |  Run #[code pip install -U pip] to upgrade to the latest version of pip.
+    |  To see which version you have installed, run #[code pip --version].

 +h(3, "import-error") Import error