mirror of https://github.com/explosion/spaCy.git

commit 8569dbc2d0 (parent 67ce96c9c9)

    Add initial stuff for Chinese parsing
@@ -36,11 +36,8 @@ from spacy.strings import hash_string
 from preshed.counter import PreshCounter
 
 from spacy.parts_of_speech import NOUN, VERB, ADJ
+from spacy.util import get_lang_class
 
-import spacy.en
-import spacy.de
-import spacy.fi
-import spacy.it
 
 try:
     unicode
@@ -197,13 +194,6 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
 
 
 def main(lang_id, lang_data_dir, corpora_dir, model_dir):
-    languages = {
-        'en': spacy.en.English.default_lex_attrs(),
-        'de': spacy.de.German.default_lex_attrs(),
-        'fi': spacy.fi.Finnish.default_lex_attrs(),
-        'it': spacy.it.Italian.default_lex_attrs(),
-    }
-
     model_dir = Path(model_dir)
     lang_data_dir = Path(lang_data_dir) / lang_id
     corpora_dir = Path(corpora_dir) / lang_id
@@ -216,7 +206,8 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir):
 
     tag_map = json.load((lang_data_dir / 'tag_map.json').open())
     setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
-    setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab')
+    setup_vocab(get_lang_class(lang_id).default_lex_attrs(), tag_map, corpora_dir,
+                model_dir / 'vocab')
 
     if (lang_data_dir / 'gazetteer.json').exists():
         copyfile(str(lang_data_dir / 'gazetteer.json'),
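Note on the hunks above: the hard-coded per-language table of default_lex_attrs() is replaced by a single spacy.util.get_lang_class(lang_id) lookup, so the model-init script no longer has to be edited for each new language such as Chinese. Below is only an illustrative sketch of that registry pattern; the class bodies and the _LANG_CLASSES dict are invented for the example and are not spaCy's actual get_lang_class implementation.

    # Illustrative registry sketch -- not spaCy's actual spacy.util code.
    # The point of the refactor: callers ask for a language class by ISO
    # code instead of enumerating every supported language inline.
    class English(object):
        lang = 'en'

        @classmethod
        def default_lex_attrs(cls):
            return {'lower': lambda string: string.lower()}

    class Chinese(object):
        lang = 'zh'

        @classmethod
        def default_lex_attrs(cls):
            return {}  # nothing language-specific wired up yet

    _LANG_CLASSES = {cls.lang: cls for cls in (English, Chinese)}

    def get_lang_class(lang_id):
        try:
            return _LANG_CLASSES[lang_id]
        except KeyError:
            raise RuntimeError("Unknown language id: %s" % lang_id)

    # Mirrors the new setup_vocab() call site:
    lex_attrs = get_lang_class('zh').default_lex_attrs()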
@@ -13,8 +13,6 @@ import plac
 import re
 
 import spacy.util
-from spacy.en import English
-from spacy.de import German
 
 from spacy.syntax.util import Config
 from spacy.gold import read_json_file
@@ -207,7 +205,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
 
 
 @plac.annotations(
-    language=("The language to train", "positional", None, str, ['en','de']),
+    language=("The language to train", "positional", None, str, ['en','de', 'zh']),
     train_loc=("Location of training file or directory"),
     dev_loc=("Location of development file or directory"),
     model_dir=("Location of output model directory",),
@@ -223,7 +221,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
 )
 def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
          debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
-    lang = {'en':English, 'de':German}.get(language)
+    lang = spacy.util.get_lang_class(language)
 
     if not eval_only:
         gold_train = list(read_json_file(train_loc))
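The trainer's CLI change is small but load-bearing: 'zh' joins the allowed choices of the plac annotation, and the Language class is resolved at runtime via spacy.util.get_lang_class(language) instead of a literal {'en': English, 'de': German} map. The sketch below shows how plac's (help, kind, abbrev, type, choices) tuple enforces that restriction; the file name and function body are hypothetical, not the actual spaCy trainer.

    # train_sketch.py -- hypothetical stand-in for the real training script.
    # Requires the plac package (pip install plac).
    import plac

    @plac.annotations(
        language=("The language to train", "positional", None, str,
                  ['en', 'de', 'zh']),
    )
    def main(language):
        # In the real script this is where get_lang_class(language) would
        # hand back the English, German or new Chinese class.
        print("training language: %s" % language)

    if __name__ == '__main__':
        plac.call(main)   # e.g. `python train_sketch.py zh`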
lang_data/zh/gazetteer.json (new file)
@@ -0,0 +1,194 @@
{
    "Reddit": [
        "PRODUCT",
        {},
        [
            [{"lower": "reddit"}]
        ]
    ],
    "SeptemberElevenAttacks": [
        "EVENT",
        {},
        [
            [
                {"orth": "9/11"}
            ],
            [
                {"lower": "september"},
                {"orth": "11"}
            ]
        ]
    ],
    "Linux": [
        "PRODUCT",
        {},
        [
            [{"lower": "linux"}]
        ]
    ],
    "Haskell": [
        "PRODUCT",
        {},
        [
            [{"lower": "haskell"}]
        ]
    ],
    "HaskellCurry": [
        "PERSON",
        {},
        [
            [
                {"lower": "haskell"},
                {"lower": "curry"}
            ]
        ]
    ],
    "Javascript": [
        "PRODUCT",
        {},
        [
            [{"lower": "javascript"}]
        ]
    ],
    "CSS": [
        "PRODUCT",
        {},
        [
            [{"lower": "css"}],
            [{"lower": "css3"}]
        ]
    ],
    "displaCy": [
        "PRODUCT",
        {},
        [
            [{"lower": "displacy"}]
        ]
    ],
    "spaCy": [
        "PRODUCT",
        {},
        [
            [{"orth": "spaCy"}]
        ]
    ],

    "HTML": [
        "PRODUCT",
        {},
        [
            [{"lower": "html"}],
            [{"lower": "html5"}]
        ]
    ],
    "Python": [
        "PRODUCT",
        {},
        [
            [{"orth": "Python"}]
        ]
    ],
    "Ruby": [
        "PRODUCT",
        {},
        [
            [{"orth": "Ruby"}]
        ]
    ],
    "Digg": [
        "PRODUCT",
        {},
        [
            [{"lower": "digg"}]
        ]
    ],
    "FoxNews": [
        "ORG",
        {},
        [
            [{"orth": "Fox"}],
            [{"orth": "News"}]
        ]
    ],
    "Google": [
        "ORG",
        {},
        [
            [{"lower": "google"}]
        ]
    ],
    "Mac": [
        "PRODUCT",
        {},
        [
            [{"lower": "mac"}]
        ]
    ],
    "Wikipedia": [
        "PRODUCT",
        {},
        [
            [{"lower": "wikipedia"}]
        ]
    ],
    "Windows": [
        "PRODUCT",
        {},
        [
            [{"orth": "Windows"}]
        ]
    ],
    "Dell": [
        "ORG",
        {},
        [
            [{"lower": "dell"}]
        ]
    ],
    "Facebook": [
        "ORG",
        {},
        [
            [{"lower": "facebook"}]
        ]
    ],
    "Blizzard": [
        "ORG",
        {},
        [
            [{"orth": "Blizzard"}]
        ]
    ],
    "Ubuntu": [
        "ORG",
        {},
        [
            [{"orth": "Ubuntu"}]
        ]
    ],
    "Youtube": [
        "PRODUCT",
        {},
        [
            [{"lower": "youtube"}]
        ]
    ],
    "false_positives": [
        null,
        {},
        [
            [{"orth": "Shit"}],
            [{"orth": "Weed"}],
            [{"orth": "Cool"}],
            [{"orth": "Btw"}],
            [{"orth": "Bah"}],
            [{"orth": "Bullshit"}],
            [{"orth": "Lol"}],
            [{"orth": "Yo"}, {"lower": "dawg"}],
            [{"orth": "Yay"}],
            [{"orth": "Ahh"}],
            [{"orth": "Yea"}],
            [{"orth": "Bah"}]
        ]
    ]
}
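Each gazetteer entry has the shape "Name": [label, attrs, patterns], where patterns is a list of alternative token-sequence specs (for example [{"lower": "reddit"}] or the two-token [{"lower": "september"}, {"orth": "11"}]); the false_positives entry carries a null label. That structure can be read straight off the data above; the sketch below just loads and walks it, and makes no assumption about how spaCy consumes it downstream.

    # Sketch: inspect entries of lang_data/zh/gazetteer.json as committed.
    import json

    with open('lang_data/zh/gazetteer.json') as f:
        gazetteer = json.load(f)

    for name, (label, attrs, patterns) in gazetteer.items():
        for token_specs in patterns:
            # Each dict constrains one token, keyed by the attribute to
            # match, e.g. {"lower": "css"} or {"orth": "9/11"}.
            print(name, label, token_specs)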
lang_data/zh/infix.txt (new file)
@@ -0,0 +1,6 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])
(?<=[a-zA-Z])--(?=[a-zA-z])
(?<=[0-9])-(?=[0-9])
(?<=[A-Za-z]),(?=[A-Za-z])
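The infix rules use lookbehind/lookahead guards so that only the separator itself is matched, e.g. a hyphen or comma flanked by letters. (Worth a reviewer's glance: the hyphen rules use the character class [a-zA-z] with a lowercase z in the second range, which Python's re accepts but which covers more characters than [a-zA-Z].) A quick illustration of where such a rule fires, independent of spaCy's tokenizer internals:

    # Sketch: apply the hyphen infix rule from lang_data/zh/infix.txt.
    import re

    hyphen_infix = re.compile(r'(?<=[a-zA-Z])-(?=[a-zA-z])')  # as committed

    # re.split drops the matched hyphen; a rule-based tokenizer would
    # instead use the match position to cut the token apart.
    print(hyphen_infix.split('state-of-the-art'))  # ['state', 'of', 'the', 'art']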
lang_data/zh/morphs.json (new file)
@@ -0,0 +1 @@
{}
lang_data/zh/prefix.txt (new file)
@@ -0,0 +1,21 @@
,
"
(
[
{
*
<
$
£
“
'
``
`
#
US$
C$
A$
a-
‘
....
...
lang_data/zh/specials.json (new file)
@@ -0,0 +1 @@
{}
lang_data/zh/suffix.txt (new file)
@@ -0,0 +1,26 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
”
''
's
'S
’s
’S
’
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km
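prefix.txt lists literal strings to peel off the front of a token (quotes, brackets, currency markers), while the suffix.txt entries are regex fragments that conceptually anchor at the end of a token, e.g. (?<=[0-9])km or the possessive 's. One plausible way to fold such a list into a single matcher is sketched below; the combining strategy is an assumption for illustration, not spaCy's actual tokenizer-construction code.

    # Sketch: combine a few suffix fragments (from lang_data/zh/suffix.txt)
    # into one end-anchored alternation and peel suffixes off sample tokens.
    import re

    suffix_fragments = [r'\)', r'\]', r'%', r"'s", r'\.\.\.', r'(?<=[0-9])km']
    suffix_re = re.compile('|'.join('(?:%s)$' % frag for frag in suffix_fragments))

    for token in ['(hello)', "John's", 'wait...', '10km']:
        match = suffix_re.search(token)
        print(token, '->', match.group(0) if match else None)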
lang_data/zh/tag_map.json (new file)
@@ -0,0 +1,43 @@
{
    "NR": {"pos": "PROPN"},
    "AD": {"pos": "ADV"},
    "NN": {"pos": "NOUN"},
    "CD": {"pos": "NUM"},
    "DEG": {"pos": "PART"},
    "PN": {"pos": "PRON"},
    "M": {"pos": "PART"},
    "JJ": {"pos": "ADJ"},
    "DEC": {"pos": "PART"},
    "NT": {"pos": "NOUN"},
    "DT": {"pos": "DET"},
    "LC": {"pos": "PART"},
    "CC": {"pos": "CONJ"},
    "AS": {"pos": "PART"},
    "SP": {"pos": "PART"},
    "IJ": {"pos": "INTJ"},
    "OD": {"pos": "NUM"},
    "MSP": {"pos": "PART"},
    "CS": {"pos": "SCONJ"},
    "ETC": {"pos": "PART"},
    "DEV": {"pos": "PART"},
    "BA": {"pos": "AUX"},
    "SB": {"pos": "AUX"},
    "DER": {"pos": "PART"},
    "LB": {"pos": "AUX"},
    "P": {"pos": "ADP"},
    "URL": {"pos": "SYM"},
    "FRAG": {"pos": "X"},
    "X": {"pos": "X"},
    "ON": {"pos": "X"},
    "FW": {"pos": "X"},
    "VC": {"pos": "VERB"},
    "VV": {"pos": "VERB"},
    "VA": {"pos": "VERB"},
    "VE": {"pos": "VERB"},
    "PU": {"pos": "PUNCT"},
    "SP": {"pos": "SPACE"},
    "NP": {"pos": "X"},
    "_": {"pos": "X"},
    "VP": {"pos": "X"},
    "CHAR": {"pos": "X"}
}
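The tag map translates Chinese Treebank-style tags (NR, VV, PU, ...) into coarse POS values. One thing to flag for review: the key "SP" appears twice in the committed file, first mapped to PART and again to SPACE near the end; Python's json module keeps the last occurrence, so "SP" will load as SPACE. A minimal lookup sketch using the file exactly as committed:

    # Sketch: coarse POS lookup with the committed tag map.
    import json

    with open('lang_data/zh/tag_map.json') as f:
        tag_map = json.load(f)

    tagged = [(u'他', 'PN'), (u'喜欢', 'VV'), (u'猫', 'NN'), (u'。', 'PU')]
    for word, tag in tagged:
        print(word, tag, '->', tag_map[tag]['pos'])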
spacy/zh/__init__.py (new file)
@@ -0,0 +1,5 @@
from ..language import Language


class Chinese(Language):
    lang = u'zh'
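The new spacy/zh package is deliberately minimal: it subclasses Language and declares only the language code, which is what the get_lang_class-based lookups in the scripts above key on; judging from this diff, a Chinese word segmenter and trained model data are not part of the commit. A tiny usage sketch, assuming only what the five lines above define plus the existing spacy.language.Language base class:

    # Sketch: the subclass added here only carries the language code.
    from spacy.language import Language
    from spacy.zh import Chinese

    assert issubclass(Chinese, Language)
    print(Chinese.lang)   # u'zh'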