mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-29 11:26:28 +03:00
66 lines
1.9 KiB
Python
66 lines
1.9 KiB
Python
|
import re
|
||
|
import os
|
||
|
from os import path
|
||
|
|
||
|
|
||
|
def parse(sent_text, strip_bad_periods=False):
|
||
|
sent_text = sent_text.strip()
|
||
|
assert sent_text and sent_text.startswith('(')
|
||
|
open_brackets = []
|
||
|
brackets = []
|
||
|
bracketsRE = re.compile(r'(\()([^\s\)\(]+)|([^\s\)\(]+)?(\))')
|
||
|
word_i = 0
|
||
|
words = []
|
||
|
# Remove outermost bracket
|
||
|
if sent_text.startswith('(('):
|
||
|
sent_text = sent_text.replace('((', '( (', 1)
|
||
|
for match in bracketsRE.finditer(sent_text[2:-1]):
|
||
|
open_, label, text, close = match.groups()
|
||
|
if open_:
|
||
|
assert not close
|
||
|
assert label.strip()
|
||
|
open_brackets.append((label, word_i))
|
||
|
else:
|
||
|
assert close
|
||
|
label, start = open_brackets.pop()
|
||
|
assert label.strip()
|
||
|
if strip_bad_periods and words and _is_bad_period(words[-1], text):
|
||
|
continue
|
||
|
# Traces leave 0-width bracket, but no token
|
||
|
if text and label != '-NONE-':
|
||
|
words.append(text)
|
||
|
word_i += 1
|
||
|
else:
|
||
|
brackets.append((label, start, word_i))
|
||
|
return words, brackets
|
||
|
|
||
|
|
||
|
def _is_bad_period(prev, period):
|
||
|
if period != '.':
|
||
|
return False
|
||
|
elif prev == '.':
|
||
|
return False
|
||
|
elif not prev.endswith('.'):
|
||
|
return False
|
||
|
else:
|
||
|
return True
|
||
|
|
||
|
|
||
|
def split(text):
|
||
|
sentences = []
|
||
|
current = []
|
||
|
|
||
|
for line in text.strip().split('\n'):
|
||
|
line = line.rstrip()
|
||
|
if not line:
|
||
|
continue
|
||
|
# Detect the start of sentences by line starting with (
|
||
|
# This is messy, but it keeps bracket parsing at the sentence level
|
||
|
if line.startswith('(') and current:
|
||
|
sentences.append('\n'.join(current))
|
||
|
current = []
|
||
|
current.append(line)
|
||
|
if current:
|
||
|
sentences.append('\n'.join(current))
|
||
|
return sentences
|