diff --git a/bin/ner_tag.py b/bin/ner_tag.py new file mode 100644 index 000000000..e7ec1e51e --- /dev/null +++ b/bin/ner_tag.py @@ -0,0 +1,34 @@ +import codecs +import plac + +from spacy.en import English + + +def main(text_loc): + with codecs.open(text_loc, 'r', 'utf8') as file_: + text = file_.read() + NLU = English() + for paragraph in text.split('\n\n'): + tokens = NLU(paragraph) + + ent_starts = {} + ent_ends = {} + for span in tokens.ents: + ent_starts[span.start] = span.label_ + ent_ends[span.end] = span.label_ + + output = [] + for token in tokens: + if token.i in ent_starts: + output.append('<%s>' % ent_starts[token.i]) + output.append(token.orth_) + if (token.i+1) in ent_ends: + output.append('' % ent_ends[token.i+1]) + output.append('\n\n') + print ' '.join(output) + + +if __name__ == '__main__': + plac.call(main) + +