fish-shell/share/tools/deroff.py

1089 lines
32 KiB
Python
Raw Normal View History

2012-04-08 16:43:30 +08:00
#!/usr/bin/python
# -*- coding: utf-8 -*-
""" Deroff.py, ported to Python from the venerable deroff.c """
2012-04-15 19:41:20 +08:00
import sys, re, string
2012-04-08 16:43:30 +08:00
class Deroffer:
2012-04-15 16:15:10 +08:00
# Two-character special-character names (the text following "\(") that
# stand for letters.  spec() emits the mapped text and sets
# self.specletter to True when the name is found in this table.
g_specs_specletter = {
    # Output composed latin1 letters
    '-D': '\320',
    'Sd': '\360',
    'Tp': '\376',
    'TP': '\336',
    'AE': '\306',
    'ae': '\346',
    'OE': "OE",
    'oe': "oe",
    ':a': '\344',
    ':A': '\304',
    ':e': '\353',
    ':E': '\313',
    ':i': '\357',
    ':I': '\317',
    ':o': '\366',
    ':O': '\326',
    ':u': '\374',
    ':U': '\334',
    ':y': '\377',
    'ss': '\337',
    '\'A': '\301',
    '\'E': '\311',
    '\'I': '\315',
    '\'O': '\323',
    '\'U': '\332',
    '\'Y': '\335',
    '\'a': '\341',
    '\'e': '\351',
    '\'i': '\355',
    '\'o': '\363',
    '\'u': '\372',
    '\'y': '\375',
    '^A': '\302',
    '^E': '\312',
    '^I': '\316',
    '^O': '\324',
    '^U': '\333',
    '^a': '\342',
    '^e': '\352',
    '^i': '\356',
    '^o': '\364',
    '^u': '\373',
    '`A': '\300',
    '`E': '\310',
    '`I': '\314',
    '`O': '\322',
    '`U': '\331',
    '`a': '\340',
    '`e': '\350',
    '`i': '\354',
    '`o': '\362',
    '`u': '\371',
    '~A': '\303',
    '~N': '\321',
    '~O': '\325',
    '~a': '\343',
    '~n': '\361',
    '~o': '\365',
    ',C': '\307',
    ',c': '\347',
    '/l': "/l",
    '/L': "/L",
    '/o': '\370',
    '/O': '\330',
    'oA': '\305',
    'oa': '\345',
    # Ligatures
    'fi': 'fi',
    'ff': 'ff',
    'fl': 'fl',
    'Fi': 'ffi',
    'Ff': 'fff',
    'Fl': 'ffl'
}
# Two-character special-character names (the text following "\(") that are
# NOT letters: punctuation, symbols, arrows, math operators and Greek names.
# spec() consults this table only after g_specs_specletter misses, and
# leaves self.specletter False for these entries.
g_specs = {
    'mi': '-',
    'en': '-',
    'hy': '-',
    'em': "--",
    'lq': "\"", # PCA: This used to be left and right smart quotes, but they look dumb
    'rq': "\"", # So just use ordinary double quotes
    'Bq': ",,",
    'oq': '`',
    'cq': '\'',
    'aq': '\'',
    'dq': '"',
    'or': '|',
    'at': '@',
    'sh': '#',
    'Eu': '\244',
    'eu': '\244',
    'Do': '$',
    'ct': '\242',
    'Fo': '\253',
    'Fc': '\273',
    'fo': '<',
    'fc': '>',
    'r!': '\241',
    'r?': '\277',
    'Of': '\252',
    'Om': '\272',
    'pc': '\267',
    'S1': '\271',
    'S2': '\262',
    'S3': '\263',
    '<-': "<-",
    '->': "->",
    '<>': "<->",
    'ua': '^',
    'da': 'v',
    'lA': "<=",
    'rA': "=>",
    'hA': "<=>",
    'uA': "^^",
    'dA': "vv",
    'ba': '|',
    'bb': '|',
    'br': '|',
    'bv': '|',
    'ru': '_',
    'ul': '_',
    'ci': 'O',
    'bu': 'o',
    'co': '\251',
    'rg': '\256',
    'tm': "(TM)",
    'dd': "||",
    'dg': '|',
    'ps': '\266',
    'sc': '\247',
    'de': '\260',
    '%0': "0/00",
    '14': '\274',
    '12': '\275',
    '34': '\276',
    'f/': '/',
    'sl': '/',
    'rs': '\\',
    'sq': "[]",
    'fm': '\'',
    'ha': '^',
    'ti': '~',
    'lB': '[',
    'rB': ']',
    'lC': '{',
    'rC': '}',
    'la': '<',
    'ra': '>',
    'lh': "<=",
    'rh': "=>",
    'tf': "therefore",
    '~~': "~~",
    '~=': "~=",
    '!=': "!=",
    '**': '*',
    '+-': '\261',
    '<=': "<=",
    '==': "==",
    '=~': "=~",
    '>=': ">=",
    'AN': "\\/",
    'OR': "/\\",
    'no': '\254',
    'te': "there exists",
    'fa': "for all",
    'Ah': "aleph",
    'Im': "imaginary",
    'Re': "real",
    'if': "infinity",
    'md': "\267",
    'mo': "member of",
    'mu': '\327',
    'nm': "not member of",
    'pl': '+',
    'eq': '=',
    'pt': "oc",
    'pp': "perpendicular",
    'sb': "(=",
    'sp': "=)",
    'ib': "(-",
    'ip': "-)",
    'ap': '~',
    'is': 'I',
    'sr': "root",
    'pd': 'd',
    'c*': "(x)",
    'c+': "(+)",
    'ca': "cap",
    'cu': 'U',
    'di': '\367',
    'gr': 'V',
    'es': "{}",
    'CR': "_|",
    'st': "such that",
    '/_': "/_",
    'lz': "<>",
    'an': '-',
    # Output Greek
    '*A': "Alpha",
    '*B': "Beta",
    '*C': "Xi",
    '*D': "Delta",
    '*E': "Epsilon",
    '*F': "Phi",
    '*G': "Gamma",
    '*H': "Theta",
    '*I': "Iota",
    '*K': "Kappa",
    '*L': "Lambda",
    '*M': "Mu",
    '*N': "Nu",
    '*O': "Omicron",
    '*P': "Pi",
    '*Q': "Psi",
    '*R': "Rho",
    '*S': "Sigma",
    '*T': "Tau",
    '*U': "Upsilon",
    '*W': "Omega",
    '*X': "Chi",
    '*Y': "Eta",
    '*Z': "Zeta",
    '*a': "alpha",
    '*b': "beta",
    '*c': "xi",
    '*d': "delta",
    '*e': "epsilon",
    '*f': "phi",
    '+f': "phi",
    '*g': "gamma",
    '*h': "theta",
    '+h': "theta",
    '*i': "iota",
    '*k': "kappa",
    '*l': "lambda",
    '*m': "\265",
    '*n': "nu",
    '*o': "omicron",
    '*p': "pi",
    '+p': "omega",
    '*q': "psi",
    '*r': "rho",
    '*s': "sigma",
    '*t': "tau",
    '*u': "upsilon",
    '*w': "omega",
    '*x': "chi",
    '*y': "eta",
    '*z': "zeta",
    'ts': "sigma",
}
# Pre-compiled patterns shared by every Deroffer instance.
g_re_word = re.compile(r'[a-zA-Z_]+') # equivalent to the word() method
g_re_number = re.compile(r'[+-]?\d+') # equivalent to the number() method
# NOTE: the verbose literals below must contain nothing but the pattern and
# its '#' comments; any stray text inside them becomes part of the regex.
g_re_esc_char = re.compile(r"""([a-zA-Z_]) |   # Word
                               ([+-]?\d)   |   # Number
                               \\              # Backslash (for escape seq)
                            """, re.VERBOSE)
g_re_not_backslash_or_whitespace = re.compile(r'[^ \t\n\r\f\v\\]+') # Match a sequence of not backslash or whitespace
# Runs of three or more newlines are replaced by get_output()
g_re_newline_collapse = re.compile(r'\n{3,}')
g_re_font = re.compile(r"""\\f(         # Starts with backslash f
                           (\(\S{2})  | # Open paren, then two printable chars
                           (\[\S*?\]) | # Open bracket, zero or more printable characters, then close bracket
                           \S)          # Any printable character
                        """, re.VERBOSE)
# This gets filled in in __init__ below
g_macro_dict = False
2012-04-08 16:43:30 +08:00
def __init__(self):
self.reg_table = {}
2012-04-15 19:41:20 +08:00
self.tr_from = ''
self.tr_to = ''
self.tr = ''
2012-04-08 16:43:30 +08:00
self.nls = 2
self.specletter = False
self.refer = False
self.macro = 0
self.nobody = False
self.inlist = False
self.inheader = False
self.pic = False
self.tbl = False
self.tblstate = 0
self.tblTab = ''
self.eqn = False
self.skipheaders = False
self.skiplists = False
self.ignore_sonx = False
self.output = []
self.OPTIONS = 0
self.FORMAT = 1
self.DATA = 2
# words is uninteresting and should be treated as false
2012-04-16 10:22:30 +08:00
if not Deroffer.g_macro_dict:
Deroffer.g_macro_dict = {
'SH': Deroffer.macro_sh,
'SS': Deroffer.macro_ss_ip,
'IP': Deroffer.macro_ss_ip,
'H ': Deroffer.macro_ss_ip,
'I ': Deroffer.macro_i_ir,
'IR': Deroffer.macro_i_ir,
'IB': Deroffer.macro_i_ir,
'B ': Deroffer.macro_i_ir,
'BR': Deroffer.macro_i_ir,
'BI': Deroffer.macro_i_ir,
'R ': Deroffer.macro_i_ir,
'RB': Deroffer.macro_i_ir,
'RI': Deroffer.macro_i_ir,
'AB': Deroffer.macro_i_ir,
'] ': Deroffer.macro_close_bracket,
'PS': Deroffer.macro_ps,
'PE': Deroffer.macro_pe,
'TS': Deroffer.macro_ts,
'T&': Deroffer.macro_t_and,
'TE': Deroffer.macro_te,
'EQ': Deroffer.macro_eq,
'EN': Deroffer.macro_en,
'R1': Deroffer.macro_r1,
'R2': Deroffer.macro_r2,
'de': Deroffer.macro_de,
'BL': Deroffer.macro_bl_vl,
'VL': Deroffer.macro_bl_vl,
'AL': Deroffer.macro_bl_vl,
'LB': Deroffer.macro_bl_vl,
'RL': Deroffer.macro_bl_vl,
'ML': Deroffer.macro_bl_vl,
'DL': Deroffer.macro_bl_vl,
'BV': Deroffer.macro_bv,
'LE': Deroffer.macro_le,
'LP': Deroffer.macro_lp_pp,
'PP': Deroffer.macro_lp_pp,
'P\n': Deroffer.macro_lp_pp,
'ds': Deroffer.macro_ds,
'so': Deroffer.macro_so_nx,
'nx': Deroffer.macro_so_nx,
'tr': Deroffer.macro_tr,
'sp': Deroffer.macro_sp
}
2012-04-08 16:43:30 +08:00
def flush_output(self, where):
if where:
2012-04-15 16:15:10 +08:00
where.write(self.get_output())
2012-04-08 16:43:30 +08:00
self.output[:] = []
def get_output(self):
2012-04-15 16:15:10 +08:00
res = ''.join(self.output)
clean_res = Deroffer.g_re_newline_collapse.sub('\n', res)
return clean_res
2012-04-15 19:41:20 +08:00
2012-04-08 16:43:30 +08:00
def putchar(self, c):
2012-04-15 16:15:10 +08:00
self.output.append(c)
2012-04-08 16:43:30 +08:00
return c
2012-04-15 19:41:20 +08:00
# This gets swapped in in place of condputs the first time tr gets modified
def condputs_tr(self, str):
2012-04-15 16:15:10 +08:00
special = self.pic or self.eqn or self.refer or self.macro or (self.skiplists and self.inlist) or (self.skipheaders and self.inheader)
if not special:
2012-04-15 19:41:20 +08:00
self.output.append(str.translate(self.tr))
2012-04-15 16:15:10 +08:00
def condputs(self, str):
special = self.pic or self.eqn or self.refer or self.macro or (self.skiplists and self.inlist) or (self.skipheaders and self.inheader)
2012-04-15 19:41:20 +08:00
if not special:
self.output.append(str)
2012-04-08 16:43:30 +08:00
def str_at(self, idx):
return self.s[idx:idx+1]
def skip_char(self, amt=1):
self.s = self.s[amt:]
def skip_leading_whitespace(self):
self.s = self.s.lstrip()
def is_white(self, idx):
# Note this returns false for empty strings (idx >= len(self.s))
2012-04-15 19:41:20 +08:00
return self.s[idx:idx+1].isspace()
2012-04-08 16:43:30 +08:00
def str_eq(offset, other, len):
return self.s[offset:offset+len] == other[:len]
def prch(self, idx):
2012-04-15 19:41:20 +08:00
# Note that this return False for the empty string (idx >= len(self.s))
ch = self.s[idx:idx+1]
return ch not in ' \t\n'
2012-04-08 16:43:30 +08:00
def font(self):
2012-04-15 19:41:20 +08:00
match = Deroffer.g_re_font.match(self.s)
if not match: return False
self.skip_char(match.end())
return True
def font2(self):
2012-04-16 10:22:30 +08:00
if self.s[0:2] == '\\f':
2012-04-15 19:41:20 +08:00
c = self.str_at(2)
if c == '(' and self.prch(3) and self.prch(4):
2012-04-08 16:43:30 +08:00
self.skip_char(5)
return True
2012-04-15 19:41:20 +08:00
elif c == '[':
2012-04-08 16:43:30 +08:00
self.skip_char(2)
while self.prch(0) and self.str_at(0) != ']': self.skip_char()
if self.str_at(0) == ']': self.skip_char()
elif self.prch(2):
self.skip_char(3)
return True
return False
def comment(self):
2012-04-16 10:22:30 +08:00
# Here we require that the string start with \"
while self.str_at(0) and self.str_at(0) != '\n': self.skip_char()
return True
2012-04-08 16:43:30 +08:00
def numreq(self):
2012-04-16 10:22:30 +08:00
# We require that the string starts with backslash
if self.str_at(1) in 'hvwud' and self.str_at(2) == '\'':
2012-04-08 16:43:30 +08:00
self.macro += 1
self.skip_char(3)
while self.str_at(0) != '\'' and self.esc_char():
pass # Weird
if self.str_at(0) == '\'':
self.skip_char()
self.macro -= 1
return True
return False
def var(self):
reg = ''
2012-04-16 10:22:30 +08:00
s0s1 = self.s[0:2]
if s0s1 == '\\n':
if self.s[3:5] == 'dy':
2012-04-08 16:43:30 +08:00
self.skip_char(5)
return True
elif self.str_at(2) == '(' and self.prch(3) and self.prch(4):
self.skip_char(5)
return True
elif self.str_at(2) == '[' and self.prch(3):
self.skip_char(3)
while self.str_at(0) and self.str_at(0) != ']':
self.skip_char()
return True
elif self.prch(2):
self.skip_char(3)
return True
2012-04-16 10:22:30 +08:00
elif s0s1 == '\\*':
2012-04-08 16:43:30 +08:00
if self.str_at(2) == '(' and self.prch(3) and self.prch(4):
reg = self.s[3:5]
self.skip_char(5)
elif self.str_at(2) == '[' and self.prch(3):
self.skip_char(3)
while self.str_at(0) and self.str_at(0) != ']':
reg = reg + self.str_at(0)
self.skip_char()
2012-04-16 10:22:30 +08:00
if self.s[0:1] == ']':
2012-04-08 16:43:30 +08:00
self.skip_char()
else:
return False
elif self.prch(2):
reg = self.str_at(2)
self.skip_char(3)
else:
return False
if reg in self.reg_table:
old_s = self.s
self.s = self.reg_table[reg]
self.text_arg()
return True
return False
def size(self):
2012-04-16 10:22:30 +08:00
# We require that the string starts with \s
if self.digit(2) or (self.str_at(2) in '-+' and self.digit(3)):
2012-04-08 16:43:30 +08:00
self.skip_char(3)
while self.digit(0): self.skip_char()
return True
return False
def spec(self):
self.specletter = False
2012-04-16 10:22:30 +08:00
if self.s[0:2] == '\\(' and self.prch(2) and self.prch(3):
2012-04-08 16:43:30 +08:00
key = self.s[2:4]
2012-04-15 16:15:10 +08:00
if key in Deroffer.g_specs_specletter:
self.condputs(Deroffer.g_specs_specletter[key])
2012-04-08 16:43:30 +08:00
self.specletter = True
2012-04-15 16:15:10 +08:00
elif key in Deroffer.g_specs:
self.condputs(Deroffer.g_specs[key])
2012-04-08 16:43:30 +08:00
self.skip_char(4)
return True
elif self.s.startswith('\\%'):
self.specletter = True
self.skip_char(2)
return True
else:
return False
def esc(self):
2012-04-16 10:22:30 +08:00
# We require that the string start with backslash
c = self.s[1:2]
if not c: return False
if c in 'eE':
self.condputs('\\')
elif c in 't':
self.condputs('\t')
elif c in '0~':
self.condputs(' ')
elif c in '|^&:':
pass
else:
self.condputs(c)
self.skip_char(2)
return True
2012-04-15 16:15:10 +08:00
2012-04-08 16:43:30 +08:00
def word(self):
2012-04-15 16:15:10 +08:00
got_something = False
while True:
match = Deroffer.g_re_word.match(self.s)
if not match: break
got_something = True
self.condputs(match.group(0))
self.skip_char(match.end(0))
# Consume all specials
while self.spec():
if not self.specletter: break
return got_something
2012-04-08 16:43:30 +08:00
def text(self):
2012-04-15 19:41:20 +08:00
while True:
2012-04-15 16:15:10 +08:00
idx = self.s.find('\\')
if idx == -1:
self.condputs(self.s)
self.s = ''
2012-04-15 19:41:20 +08:00
break
2012-04-15 16:15:10 +08:00
else:
self.condputs(self.s[:idx])
self.skip_char(idx)
2012-04-16 10:22:30 +08:00
if not self.esc_char_backslash():
2012-04-15 19:41:20 +08:00
self.condputs(self.str_at(0))
2012-04-15 16:15:10 +08:00
self.skip_char()
2012-04-08 16:43:30 +08:00
return True
def letter(self, idx):
ch = self.str_at(idx)
return ch.isalpha() or ch == '_' # underscore is used in C identifiers
def digit(self, idx):
ch = self.str_at(idx)
return ch.isdigit()
def number(self):
2012-04-15 16:15:10 +08:00
match = Deroffer.g_re_number.match(self.s)
if not match:
return False
else:
self.condputs(match.group(0))
self.skip_char(match.end())
2012-04-08 16:43:30 +08:00
return True
2012-04-15 19:41:20 +08:00
2012-04-16 10:22:30 +08:00
def esc_char_backslash(self):
# Like esc_char, but we know the string starts with a backslash
c = self.s[1:2]
if c == '"':
return self.comment()
elif c == 'f':
return self.font()
elif c == 's':
return self.size()
elif c in 'hvwud':
return self.numreq()
elif c in 'n*':
return self.var()
elif c == '(':
return self.spec()
else:
return self.esc()
2012-04-08 16:43:30 +08:00
2012-04-15 19:41:20 +08:00
def esc_char(self):
2012-04-16 10:22:30 +08:00
if self.s[0:1] == '\\':
return self.esc_char_backslash()
2012-04-15 19:41:20 +08:00
return self.word() or self.number()
2012-04-08 16:43:30 +08:00
def quoted_arg(self):
if self.str_at(0) == '"':
self.skip_char()
while self.s and self.str_at(0) != '"':
if not self.esc_char():
if self.s:
2012-04-15 19:41:20 +08:00
self.condputs(self.str_at(0))
2012-04-08 16:43:30 +08:00
self.skip_char()
return True
else:
return False
2012-04-15 16:15:10 +08:00
2012-04-08 16:43:30 +08:00
def text_arg(self):
    """Emit one whitespace-delimited argument from the start of the input.

    Plain runs are copied through and escapes are interpreted.  Scanning
    stops at the first whitespace character or at the end of the input.
    Returns True when plain text (or an unrecognised escape character) was
    emitted, False otherwise.
    """
    # PCA: The deroff.c textArg() disallowed quotes at the start of an argument
    # I'm not sure if this was a bug or not
    got_something = False
    while True:
        match = Deroffer.g_re_not_backslash_or_whitespace.match(self.s)
        if match:
            # Output the characters in the match
            self.condputs(match.group(0))
            self.skip_char(match.end(0))
            got_something = True

        # Next is either an escape, or whitespace, or the end
        # If it's the whitespace or the end, we're done
        if not self.s or self.is_white(0):
            return got_something

        # Try an escape (the only way to get here is a leading backslash)
        if not self.esc_char():
            # Some busted escape? Just output it
            self.condputs(self.str_at(0))
            self.skip_char()
            got_something = True
def text_arg2(self):
2012-04-08 16:43:30 +08:00
if not self.esc_char():
if self.s and not self.is_white(0):
2012-04-15 19:41:20 +08:00
self.condputs(self.str_at(0))
2012-04-08 16:43:30 +08:00
self.skip_char()
else:
return False
while True:
if not self.esc_char():
if self.s and not self.is_white(0):
2012-04-15 19:41:20 +08:00
self.condputs(self.str_at(0))
2012-04-08 16:43:30 +08:00
self.skip_char()
else:
return True
2012-04-16 10:22:30 +08:00
# Macro functions
def macro_sh(self):
for header_str in [' SYNOPSIS', ' "SYNOPSIS', ' BERSICHT', ' "BERSICHT']:
if self.s[2:].startswith(header_str):
self.inheader = True
break
else:
# Did not find a header string
self.inheader = False
self.nobody = True
def macro_ss_ip(self):
self.nobody = True
return False
def macro_i_ir(self):
pass
return False
def macro_close_bracket(self):
self.refer = False
return False
def macro_ps(self):
if self.is_white(2): self.pic = True
self.condputs('\n')
return True
def macro_pe(self):
if self.is_white(2): self.pic = False
self.condputs('\n')
return True
def macro_ts(self):
if self.is_white(2): self.tbl, self.tblstate = True, self.OPTIONS
self.condputs('\n')
return True
def macro_t_and(self):
if self.is_white(2): self.tbl, self.tblstate = True, self.FORMAT
self.condputs('\n')
return True
def macro_te(self):
if self.is_white(2): self.tbl = False
self.condputs('\n')
return True
def macro_eq(self):
if self.is_white(2): self.eqn = True
self.condputs('\n')
return True
def macro_en(self):
if self.is_white(2): self.eqn = False
self.condputs('\n')
return True
def macro_r1(self):
if self.is_white(2): self.refer2 = True
self.condputs('\n')
return True
def macro_r2(self):
if self.is_white(2): self.refer2 = False
self.condputs('\n')
return True
def macro_de(self):
macro=True
self.condputs('\n')
return True
def macro_bl_vl(self):
if self.is_white(2): self.inlist = True
self.condputs('\n')
return True
def macro_bv(self):
if self.str_at(2) == 'L' and self.white(self.str_at(3)): self.inlist = True
self.condputs('\n')
return True
def macro_le(self):
if self.is_white(2): self.inlist = False
self.condputs('\n')
return True
def macro_lp_pp(self):
self.condputs('\n')
return True
def macro_ds(self):
self.skip_char(2)
self.skip_leading_whitespace()
if self.str_at(0):
# Split at whitespace
comps = self.s.split(None, 2)
if len(comps) is 2:
name, value = comps
value = value.rstrip()
self.reg_table[name] = value
self.condputs('\n')
return True
def macro_so_nx(self):
# We always ignore include directives
# deroff.c for some reason allowed this to fall through to the 'tr' case
# I think that was just a bug so I won't replicate it
return True
def macro_tr(self):
self.skip_char(2)
self.skip_leading_whitespace()
while self.s and self.str_at(0) != '\n':
c = self.str_at(0)
ns = self.str_at(1)
self.skip_char(2)
if not ns or ns == '\n': ns = ' '
self.tr_from += c
self.tr_to += ns
# Update our table, then swap in the slower tr-savvy condputs
try: #Python2
self.tr = string.maketrans(self.tr_from, self.tr_to)
except AttributeError: #Python3
self.tr = "".maketrans(self.tr_from, self.tr_to)
2012-04-16 10:22:30 +08:00
self.condputs = self.condputs_tr
return True
def macro_sp(self):
self.condputs('\n')
return True
def macro_other(self):
self.condputs('\n')
return True
2012-04-08 16:43:30 +08:00
def request_or_macro(self):
2012-04-16 10:22:30 +08:00
# s[0] is period or open single quote
self.skip_char()
s0 = self.s[1:2]
if s0 == '\\':
if self.str_at(1) == '"':
self.condputs('\n')
return True
else:
pass
elif s0 == '[':
self.refer = True
self.condputs('\n')
return True
elif s0 == ']':
self.refer = False
self.skip_char()
return self.text()
elif s0 == '.':
self.macro = False
self.condputs('\n')
return True
self.nobody = False
s0s1 = self.s[0:2]
macro_func = Deroffer.g_macro_dict.get(s0s1, Deroffer.macro_other)
if macro_func(self):
return True
if self.skipheaders and self.nobody: return True
self.skip_leading_whitespace()
while self.s and not self.is_white(0): self.skip_char()
self.skip_leading_whitespace()
while True:
if not self.quoted_arg() and not self.text_arg():
if self.s:
self.condputs(self.str_at(0))
self.skip_char()
else:
return True
def request_or_macro2(self):
2012-04-08 16:43:30 +08:00
self.skip_char()
2012-04-16 10:22:30 +08:00
s0 = self.s[0:1]
2012-04-08 16:43:30 +08:00
if s0 == '\\':
if self.str_at(1) == '"':
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
else:
pass
elif s0 == '[':
self.refer = True
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0 == ']':
self.refer = False
self.skip_char()
return self.text()
elif s0 == '.':
self.macro = False
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
self.nobody = False
s0s1 = self.s[0:2]
if s0s1 == 'SH':
for header_str in [' SYNOPSIS', ' "SYNOPSIS', ' BERSICHT', ' "BERSICHT']:
if self.s[2:].startswith(header_str):
self.inheader = True
break
else:
# Did not find a header string
self.inheader = False
self.nobody = True
elif s0s1 in ['SS', 'IP', 'H ']:
self.nobody = True
elif s0s1 in ['I ', 'IR', 'IB', 'B ', 'BR', 'BI', 'R ', 'RB', 'RI', 'AB']:
pass
elif s0s1 in ['] ']:
self.refer = False
elif s0s1 in ['PS']:
if self.is_white(2): self.pic = True
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['PE']:
if self.is_white(2): self.pic = False
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['TS']:
if self.is_white(2): self.tbl, self.tblstate = True, self.OPTIONS
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['T&']:
if self.is_white(2): self.tbl, self.tblstate = True, self.FORMAT
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['TE']:
if self.is_white(2): self.tbl = False
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['EQ']:
if self.is_white(2): self.eqn = True
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['EN']:
if self.is_white(2): self.eqn = False
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['R1']:
if self.is_white(2): self.refer2 = True
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['R2']:
if self.is_white(2): self.refer2 = False
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['de']:
macro=True
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['BL', 'VL', 'AL', 'LB', 'RL', 'ML', 'DL']:
if self.is_white(2): self.inlist = True
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['BV']:
if self.str_at(2) == 'L' and self.white(self.str_at(3)): self.inlist = True
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['LE']:
if self.is_white(2): self.inlist = False
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['LP', 'PP', 'P\n']:
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['ds']:
self.skip_char(2)
self.skip_leading_whitespace()
if self.str_at(0):
# Split at whitespace
comps = self.s.split(None, 2)
if len(comps) is 2:
name, value = comps
value = value.rstrip()
self.reg_table[name] = value
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['so', 'nx']:
# We always ignore include directives
# deroff.c for some reason allowed this to fall through to the 'tr' case
# I think that was just a bug so I won't replicate it
return True
elif s0s1 in ['tr']:
self.skip_char(2)
self.skip_leading_whitespace()
2012-04-15 19:41:20 +08:00
while self.s and self.str_at(0) != '\n':
c = self.str_at(0)
ns = self.str_at(1)
self.skip_char(2)
if not ns or ns == '\n': ns = ' '
self.tr_from += c
self.tr_to += ns
# Update our table, then swap in the slower tr-savvy condputs
try: #Python2
self.tr = string.maketrans(self.tr_from, self.tr_to)
except AttributeError: #Python3
self.tr = "".maketrans(self.tr_from, self.tr_to)
2012-04-15 19:41:20 +08:00
self.condputs = self.condputs_tr
2012-04-08 16:43:30 +08:00
return True
elif s0s1 in ['sp']:
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
else:
2012-04-15 19:41:20 +08:00
self.condputs('\n')
2012-04-08 16:43:30 +08:00
return True
if self.skipheaders and self.nobody: return True
self.skip_leading_whitespace()
while self.s and not self.is_white(0): self.skip_char()
self.skip_leading_whitespace()
while True:
if not self.quoted_arg() and not self.text_arg():
if self.s:
2012-04-15 19:41:20 +08:00
self.condputs(self.str_at(0))
2012-04-08 16:43:30 +08:00
self.skip_char()
else:
return True
def do_tbl(self):
    """Process one line inside a .TS/.TE table according to self.tblstate.

    OPTIONS: scan the options line (only "tab(x)" is remembered, in
    self.tblTab), then switch to FORMAT.
    FORMAT: skip the format specification; a '.' switches to DATA.
    DATA: replace the table's tab character with a real tab and emit the
    line as text.
    Always returns True.
    """
    if self.tblstate == self.OPTIONS:
        while self.s and self.str_at(0) != ';' and self.str_at(0) != '\n':
            self.skip_leading_whitespace()
            if not self.str_at(0).isalpha():
                # deroff.c has a bug where it can loop forever here...we try to work around it
                self.skip_char()
            else: # Parse option
                option = self.s
                arg = ''
                # Find the end of the alphabetic option name
                idx = 0
                while option[idx:idx+1].isalpha():
                    idx += 1
                if option[idx:idx+1] == '(':
                    # Option with an argument: keep the name, and point
                    # both self.s and arg just past the open parenthesis
                    option = option[:idx]
                    self.s = self.s[idx+1:]
                    arg = self.s
                else:
                    # No parenthesised argument: the rest of the line is
                    # discarded, which also ends the enclosing loop
                    self.s = ''
                if arg:
                    idx = arg.find(')')
                    if idx != -1:
                        arg = arg[:idx]
                    # With idx == -1 this slice is [0:], leaving self.s as is
                    self.s = self.s[idx+1:]
                else:
                    #self.skip_char()
                    pass
                if option.lower() == 'tab':
                    # Only the first character of the argument is used
                    self.tblTab = arg[0:1]
        self.tblstate = self.FORMAT
        self.condputs('\n')
    elif self.tblstate == self.FORMAT:
        while self.s and self.str_at(0) != '.' and self.str_at(0) != '\n':
            self.skip_leading_whitespace()
            if self.str_at(0): self.skip_char()
        # A period ends the format section; data lines follow
        if self.str_at(0) == '.': self.tblstate = self.DATA
        self.condputs('\n')
    elif self.tblstate == self.DATA:
        if self.tblTab:
            self.s = self.s.replace(self.tblTab, '\t')
        self.text()
    return True
def do_line(self):
2012-04-16 10:22:30 +08:00
if self.s[0:1] in ".'":
2012-04-08 16:43:30 +08:00
if not self.request_or_macro(): return False
elif self.tbl:
self.do_tbl()
else:
self.text()
return True
def deroff(self, str):
lines = str.split('\n')
for line in lines:
2012-04-15 16:15:10 +08:00
self.s = line + '\n'
2012-04-08 16:43:30 +08:00
if not self.do_line():
break
2012-04-15 16:15:10 +08:00
#self.putchar('\n')
2012-04-08 16:43:30 +08:00
def deroff_files(files):
    """Deroff each file named in `files` and write the result to stdout.

    Each path is echoed to stderr before it is processed.  Paths ending
    in '.gz' are read through gzip.  Errors from opening or reading a
    file propagate to the caller.
    """
    # Imported here so the function also works when this file is imported
    # as a module (the script entry point is the only other place that
    # imports gzip).
    import gzip

    for arg in files:
        # sys.stderr.write works on both Python 2 and 3, unlike `print >>`
        sys.stderr.write(arg + '\n')
        if arg.endswith('.gz'):
            f = gzip.open(arg, 'r')
        else:
            f = open(arg, 'r')
        try:
            contents = f.read()
        finally:
            # Release the handle even when reading fails
            f.close()
        if not isinstance(contents, str):
            # Python 3 gzip yields bytes; latin-1 maps every byte to the
            # code point of the same value, so decoding cannot fail
            contents = contents.decode('latin-1')
        d = Deroffer()
        d.deroff(contents)
        d.flush_output(sys.stdout)
if __name__ == "__main__":
    # Usage: deroff.py [--profile] FILE...
    # Kept for deroff_files(), which expects gzip to be importable
    import gzip
    cli_args = sys.argv[1:]
    if cli_args[:1] == ['--profile']:
        # Developer aid: profile the run and print the hottest functions.
        # This used to be the hard-wired default (behind `if False:`), so
        # every ordinary run paid the profiling cost and left a 'fooprof'
        # file in the current directory.
        import cProfile
        import pstats
        paths = cli_args[1:]
        cProfile.run('deroff_files(paths)', 'fooprof')
        p = pstats.Stats('fooprof')
        p.sort_stats('time').print_stats(100)
        #p.sort_stats('calls').print_callers(.5, 'startswith')
    else:
        paths = cli_args
        deroff_files(paths)