,

A simple DeTeX function in python - LaTeX to text

The LaTeX logo, typeset with LaTeX
The LaTeX logo, typeset with LaTeX (Photo credit: Wikipedia)
I have implemented a simple DeTeX function in Python. I provide this function below, as is and without any guarantee. If you run it, it should turn the example LaTeX text into "simple" text thanks to the detex() function defined in the code.

It's a quick and dirty approach: I did not try to implement the full LaTeX syntax. I just applied a few regexps to strip the commands from the text. Feedback will be appreciated in the comment form below :)

Beware of the "backslash plague", as explained in http://docs.python.org/2/howto/regex.html.

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import re

# When True, applyRegexps() prints every rule it is about to apply (debug aid).
testMode = False


def applyRegexps(text, listRegExp):
    """Apply a sequence of regex substitutions to a text, in order.

    Args:
        text: the string to transform.
        listRegExp: iterable of rules; each rule is a dict whose 'left'
            entry is a regular-expression pattern and whose 'right' entry
            is the replacement template handed to ``re.sub``.

    Returns:
        The text after every rule has been applied; each rule works on the
        output of the previous one.
    """
    if testMode:
        # Rules are dicts, so they must be formatted before joining:
        # str.join() only accepts strings.  A single-argument print(...)
        # behaves the same on Python 2 and Python 3.
        print('\n'.join('%r -> %r' % (rule['left'], rule['right'])
                        for rule in listRegExp))
    # apply all the rules of the rule set, one after the other
    for rule in listRegExp:
        text = re.sub(rule['left'], rule['right'], text)
    return text

"""
_ _ ____
__| | ___| |_ _____ __/ /\ \
/ _` |/ _ \ __/ _ \ \/ / | | |
| (_| | __/ || __/> <| | | |
\__,_|\___|\__\___/_/\_\ | | |
\_\/_/
"""

def detex(latexText):
    """Transform a LaTeX text into a simple plain text.

    This is a regexp-based approximation, not a LaTeX parser.  The rules
    are applied in successive stages:

    1. drop everything up to and including the first ``\\begin{document}``;
    2. drop comments (an unescaped ``%`` up to the end of the line);
    3. replace some commands by their argument (``\\emph{x}`` -> ``x``);
    4. highlight titles (``#--x--#``) and title/cite/ref arguments (``[x]``);
    5. remove some commands together with their options and arguments;
    6. replace common symbols by a textual equivalent, drop ``$`` signs and
       lonely curly brackets, and tidy up the white space.

    Brace groups are only matched when they contain no nested braces.

    Args:
        latexText: the LaTeX source, as a string.

    Returns:
        The simplified text, as a string.
    """
    def rule(pattern, replacement):
        # rule format expected by applyRegexps()
        return {'left': pattern, 'right': replacement}

    # A command name ends where the letters end: without this guard the
    # rule for "\right" would also eat the beginning of "\rightarrow".
    end_of_name = r'(?![a-zA-Z])'
    # a brace group without nested braces; the contents are captured
    braced = r'\{([^\}\{]*)\}'

    # remove all the contents of the header, ie everything before (and
    # including) the first occurrence of "\begin{document}"
    text = re.sub(r"(?s).*?(\\begin\{document\})", "", latexText, count=1)

    # remove comments; the look-behind keeps escaped percent signs (\%) and
    # also works for a comment located at the very beginning of the text
    text = applyRegexps(text, [rule(r'(?<!\\)%.*', '')])

    # --- reduce -----------------------------------------------------------
    # replace some LaTeX commands by the contents inside curly brackets
    to_reduce = [r'\\emph', r'\\textbf', r'\\textit', r'\\text',
                 r'\\IEEEauthorblockA', r'\\IEEEauthorblockN', r'\\author',
                 r'\\caption', r'\\thanks']
    text = applyRegexps(text, [rule(tag + braced, r'\1') for tag in to_reduce])

    # --- highlight --------------------------------------------------------
    # replace some LaTeX commands by the contents inside curly brackets and
    # highlight these contents
    regexps = []
    # highlight pattern: #--content--#
    for tag in [r'\\part[\*]*', r'\\chapter[\*]*', r'\\section[\*]*',
                r'\\subsection[\*]*', r'\\subsubsection[\*]*',
                r'\\paragraph[\*]*']:
        regexps.append(rule(tag + braced, r'\n#--\1--#\n'))
    # highlight pattern: [content]
    for tag in [r'\\title', r'\\author', r'\\thanks', r'\\cite', r'\\ref']:
        regexps.append(rule(tag + braced, r'[\1]'))
    text = applyRegexps(text, regexps)

    # --- remove -----------------------------------------------------------
    # remove completely some LaTeX commands, with their options and arguments
    to_remove = [r'\\maketitle', r'\\footnote', r'\\centering',
                 r'\\IEEEpeerreviewmaketitle', r'\\includegraphics',
                 r'\\IEEEauthorrefmark', r'\\label', r'\\begin', r'\\end',
                 r'\\big', r'\\right', r'\\left', r'\\documentclass',
                 r'\\usepackage', r'\\bibliographystyle', r'\\bibliography',
                 r'\\cline', r'\\multicolumn']
    # replace the tag, its [options] and its {arguments} by a single space
    text = applyRegexps(text, [
        rule(tag + end_of_name + r'(\[[^\]]*\])*(\{[^\}\{]*\})*', ' ')
        for tag in to_remove
    ])

    # --- replace ----------------------------------------------------------
    # replace some symbols by their ascii equivalent
    regexps = [
        # - common symbols
        rule(r'\\eg' + end_of_name + r'(\{\})* *', 'e.g., '),
        rule(r'\\ldots', '...'),
        rule(r'\\Rightarrow', '=>'),
        rule(r'\\rightarrow', '->'),
        # \le / \leq and \ge / \geq, but not \leftarrow, \gets, ...
        rule(r'\\leq?' + end_of_name, '<='),
        rule(r'\\geq?' + end_of_name, '>='),
        rule(r'\\_', '_'),
        rule(r'\\\\', r'\n'),
        rule(r'~', ' '),
        # Column separators become tabs.  This has to run before "\&" is
        # unescaped, otherwise an escaped ampersand would become a tab too.
        rule(r'(?<!\\)&', r'\t'),
        rule(r'\\&', '&'),
        rule(r'\\%', '%'),
        rule(r'\\item' + end_of_name, r'\t- '),
        rule(r'\\hline[ \t]*\\hline',
             '============================================='),
        rule(r'[ \t]*\\hline',
             '_____________________________________________'),
    ]
    # - special letters, written as \'e, \'{e} or \'{{e}}
    for accent, letter, accented in (("'", 'e', 'é'), ('`', 'a', 'à'),
                                     ("'", 'o', 'ó'), ("'", 'a', 'á')):
        regexps.append(rule(
            r'\\' + accent
            + r'(?:\{\{' + letter + r'\}\}|\{' + letter + r'\}|' + letter + r')',
            accented))
    regexps += [
        # keep untouched the contents of the equations
        rule(r'\$(.)\$', r'\1'),
        rule(r'\$([^\$]*)\$', r'\1'),
        # remove the remaining equation symbols ($)
        rule(r'([^\\])\$', r'\1'),
        # correct spacing problems; the replacements are plain characters
        # because a backslash in a replacement template is either copied
        # verbatim (Python 2) or rejected as a bad escape (Python 3)
        rule(r' +,', ','),
        rule(r' +', ' '),
        rule(r' +\)', ')'),
        rule(r'\( +', '('),
        rule(r' +\.', '.'),
        # remove lonely curly brackets
        rule(r'^([^\{]*)\}', r'\1'),
        rule(r'([^\\])\{([^\}]*)\}', r'\1\2'),
        rule(r'\\\{', '{'),
        rule(r'\\\}', '}'),
        # strip white space characters at end of line
        rule(r'[ \t]*\n', r'\n'),
        # collapse runs of three or more line ends into a single one
        rule(r'([ \t]*\n){3,}', r'\n'),
    ]
    # apply all those regexps and return the modified text
    return applyRegexps(text, regexps)

"""
_
_ __ ___ __ _(_)_ __
| '_ ` _ \ / _` | | '_ \
| | | | | | (_| | | | | |
|_| |_| |_|\__,_|_|_| |_|

"""
def main():
    """Run detex() on a sample LaTeX article and print the result.

    Just for debugging: the sample exercises comments, sectioning commands,
    a table, lists, a figure, citations and footnotes.
    """
    latexText = r"""
% This paper can be formatted using the peerreviewca
% (instead of conference) mode.
\documentclass[twocolumn,a4paper]{article}
%\documentclass[peerreviewca]{IEEEtran}
% correct bad hyphenation here
\hyphenation{op-ti-cal net-works semi-con-duc-tor IEEEtran pri-va-cy Au-tho-ri-za-tion}
% package for printing the date and time (version)
\usepackage{time}
\begin{document}
\title{Next Generation Networks}
\author{Tot titi\thanks{Network and Security -- test company -- toto@ieee.org}}
\maketitle
\begin{abstract}\footnote{Version : \today ; \now}
lorem ipsum(\ldots)\end{abstract}
\emph{Keywords: IP Multimedia Subsystem, Quality of Service}
\section{Introduction} \label{sect:introduction}
lorem ipsum(\ldots) \% of the world population. \cite{TISPAN2006a}. \footnote{Bearer Independent Call Control protocol}.
\hline
\section{Protocols used in IMS} \label{sect:protocols}
lorem ipsum(\ldots) \cite{rfc2327, rfc3264}.
\subsection{Authentication, Authorization, and Accounting} \label{sect:protocols_aaa}
lorem ipsum(\ldots)
\subsubsection{Additional protocols} \label{sect:protocols_additional}
lorem ipsum(\ldots)
\begin{table}
\begin{center}
\begin{tabular}{|c|c|c|}
\hline
\textbf{Capability} & \textbf{UE} & \textbf{GGSN} \\ \hline
\emph{DiffServ Edge Function} & Optional & Required \\ \hline
\emph{RSVP/IntServ} & Optional & Optional \\ \hline
\emph{IP Policy Enforcement Point} & Optional & Required \\ \hline
\end{tabular}
\caption{IP Bearer Services Manager capability in the UE and GGSN}
\label{tab_ue_ggsn}
\end{center}
\end{table}
The main transport layer functions are listed below:
\begin{my_itemize}
\item The \emph{Resource Control Enforcement Function} (RCEF) enforces policies under the control of the A-RACF. It opens and closes unidirectional filters called \emph{gates} or \emph{pinholes}, polices traffic and marks IP packets \cite{TISPAN2006c}.
\item The \emph{Border Gateway Function} (BGF) performs policy enforcement and Network Address Translation (NAT) functions under the control of the S-PDF. It operates on unidirectional flows related to a particular session (micro-flows) \cite{TISPAN2006c}.
\item The \emph{Layer 2 Termination Point} (L2TP) terminates the Layer 2 procedures of the access network \cite{TISPAN2006c}.
\end{my_itemize}
Their QoS capabilities are summarized in table \ref{tab_rcef_bgf} \cite{TISPAN2006c}.
The admission control usually follows a three step procedure:
\begin{my_enumerate}
\item Authorization of resources (\eg by the A-RACF)
\item Resource reservation (\eg by the BGF)
\item Resource commitment (\eg by the RCEF)
\end{my_enumerate}
\begin{figure}
\centering
\includegraphics[width=1.5in]{./pictures/RACS_functional_architecture}
\caption{RACS interaction with transfer functions}
\label{fig_RACS_functional_architecture}
\end{figure}
%\subsection{Example} \label{sect:qos_example}
% conference papers do not normally have an appendix
% use section* for acknowledgement
\section*{Acknowledgment}
% optional entry into table of contents (if used)
%\addcontentsline{toc}{section}{Acknowledgment}
lorem ipsum(\ldots)
\bibliographystyle{plain}
%\bibliographystyle{alpha}
\bibliography{./mabiblio}
\end{document}
"""
    text = detex(latexText)
    # A single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the "print text" statement is a syntax error on Python 3.
    print(text)


# Run the demo only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Enjoy!  And feel free to comment below or to put a link to this article on your blog. Thanks!


Share:

No comments:

Post a Comment