Merge branch 'develop' into spacy.io

2025-11-01 08:27:44 +03:00 · 2019-03-06 14:41:25 +01:00 · 2019-03-06 14:41:25 +01:00 · 0c09831227
commit 0c09831227
parent 6923928380 e9babd9973
2 changed files with 16 additions and 22 deletions
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@ -4,6 +4,7 @@ from __future__ import unicode_literals, print_function
 import re
 from collections import namedtuple

+from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from ...attrs import LANG
 from ...language import Language
@ -38,24 +39,20 @@ def resolve_pos(token):
    in the sentence. This function adds information to the POS tag to
    resolve ambiguous mappings.
    """
-
    # TODO: This is a first take. The rules here are crude approximations.
    # For many of these, full dependencies are needed to properly resolve
    # PoS mappings.
-
    if token.pos == "連体詞,*,*,*":
        if re.match(r"[こそあど此其彼]の", token.surface):
            return token.pos + ",DET"
        if re.match(r"[こそあど此其彼]", token.surface):
            return token.pos + ",PRON"
        return token.pos + ",ADJ"
-
    return token.pos


 def detailed_tokens(tokenizer, text):
    """Format Mecab output into a nice data structure, based on Janome."""
-
    node = tokenizer.parseToNode(text)
    node = node.next  # first node is beginning of sentence and empty, skip it
    words = []
@ -64,12 +61,10 @@ def detailed_tokens(tokenizer, text):
        base = surface  # a default value. Updated if available later.
        parts = node.feature.split(",")
        pos = ",".join(parts[0:4])
-
        if len(parts) > 7:
            # this information is only available for words in the tokenizer
            # dictionary
            base = parts[7]
-
        words.append(ShortUnitWord(surface, base, pos))
        node = node.next
    return words
@ -78,29 +73,25 @@ def detailed_tokens(tokenizer, text):
 class JapaneseTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-
        self.tokenizer = try_mecab_import().Tagger()
        self.tokenizer.parseToNode("")  # see #2901

    def __call__(self, text):
        dtokens = detailed_tokens(self.tokenizer, text)
-
        words = [x.surface for x in dtokens]
        spaces = [False] * len(words)
        doc = Doc(self.vocab, words=words, spaces=spaces)
-
        for token, dtoken in zip(doc, dtokens):
            token._.mecab_tag = dtoken.pos
            token.tag_ = resolve_pos(dtoken)
            token.lemma_ = dtoken.lemma
-
        return doc


 class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
-
+    stop_words = STOP_WORDS
    tag_map = TAG_MAP

    @classmethod
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -208,21 +208,24 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]

 ### Environment variables for hyperparameters {#train-hyperparams new="2"}

-spaCy lets you set hyperparameters for training via environment variables. This
-is useful, because it keeps the command simple and allows you to
-[create an alias](https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537)
-for your custom `train` command while still being able to easily tweak the
-hyperparameters. For example:
+spaCy lets you set hyperparameters for training via environment variables. For
+example:

 ```bash
-$ parser_hidden_depth=2 parser_maxout_pieces=1 spacy train [...]
+$ token_vector_width=256 learn_rate=0.0001 spacy train [...]
 ```

-```bash
-### Usage with alias
-alias train-parser="spacy train en /output /data /train /dev -n 1000"
-parser_maxout_pieces=1 train-parser
-```
+> #### Usage with alias
+>
+> Environment variables keep the command simple and allow you to
+> [create an alias](https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537)
+> for your custom `train` command while still being able to easily tweak the
+> hyperparameters.
+>
+> ```bash
+> alias train-parser="python -m spacy train en /output /data /train /dev -n 1000"
+> token_vector_width=256 train-parser
+> ```

 | Name                 | Description                                         | Default |
 | -------------------- | --------------------------------------------------- | ------- |