A simple DeTeX function in python - LaTeX to text ~ Himap

The LaTeX logo, typeset with LaTeX (Photo credit: Wikipedia)
I have implemented a simple DeTeX function in Python. I provide this function below, as is and without any guarantee. If you run it, it should change the example LaTeX text into "simple" text thanks to the detex() function defined in the code.

It's a quick and dirty approach: I did not try to implement the full LaTeX syntax. I just applied a few regexps to strip the commands from the text. Feedback will be appreciated in the comment form below :)

Beware of the "backslash plague", as explained in http://docs.python.org/2/howto/regex.html.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
    
import re

# When True, applyRegexps() prints every rule set before applying it.
testMode = False


def applyRegexps(text, listRegExp):
    """Apply a list of regexp substitutions to a text, one after the other.

    Args:
        text: the string to transform.
        listRegExp: a list of rules; each rule is a dict whose 'left' entry
            is a regular expression and whose 'right' entry is the
            replacement template handed to ``re.sub`` (so ``\\1``, ``\\n``
            and ``\\t`` are interpreted in it).

    Returns:
        The text obtained after every rule has been applied, in list order.
        Each rule sees the output of the previous one.
    """
    if testMode:
        # Rules are dicts, so they have to be stringified before being
        # joined.  The single-argument parenthesised form of print behaves
        # the same on Python 2 and Python 3.
        print('\n'.join(str(rule) for rule in listRegExp))
    # apply all the rules in the ruleset
    for rule in listRegExp:
        text = re.sub(rule['left'], rule['right'], text)
    return text

#      _      _             ____
#   __| | ___| |_ _____  __/ /\ \
#  / _` |/ _ \ __/ _ \ \/ / |  | |
# | (_| |  __/ ||  __/>  <| |  | |
#  \__,_|\___|\__\___/_/\_\ |  | |
#                          \_\/_/

def detex(latexText):
    """Transform a LaTeX text into an approximate plain text.

    This is a best-effort filter built from regular expressions, not a LaTeX
    parser.  It works in successive passes:

    1. drop the preamble (everything up to the first ``\\begin{document}``)
       and the comments;
    2. unwrap some commands, keeping only their argument;
    3. highlight headings as ``#--title--#`` and references as ``[key]``;
    4. remove some commands together with their options and arguments;
    5. replace some symbols by an ASCII equivalent, tidy up the spacing and
       strip the remaining curly brackets.

    Arguments that contain nested curly brackets are not matched by passes
    2 to 4, and commands that are not listed are left untouched.

    Args:
        latexText: the LaTeX source, as a string.

    Returns:
        The simplified text.
    """
    # An argument in curly brackets that contains no nested brackets.
    braced_arg = r'\{([^\}\{]*)\}'
    # A LaTeX control word runs up to the first non-letter, so a tag that is
    # followed by a letter is another, longer command (\right is not
    # \rightarrow, \le is not \left).
    end_of_name = r'(?![A-Za-z])'

    # remove all the contents of the header, ie everything up to and
    # including the first occurrence of "\begin{document}"
    text = re.sub(r"(?s).*?(\\begin\{document\})", "", latexText, count=1)

    # remove comments: an unescaped % and the rest of its line.  The
    # lookbehind also covers a comment located at the very start of the text.
    text = applyRegexps(text, [{'left': r'(?<!\\)%.*', 'right': r''}])

    # - replace some LaTeX commands by the contents inside curly brackets
    # NOTE: \author is listed twice; the second occurrence unwraps an
    # \author{...} whose argument contained a \caption{...} that has just
    # been flattened.
    to_reduce = [r'\\emph', r'\\textbf', r'\\textit', r'\\text',
                 r'\\IEEEauthorblockA', r'\\IEEEauthorblockN', r'\\author',
                 r'\\caption', r'\\author', r'\\thanks']
    text = applyRegexps(
        text,
        [{'left': tag + braced_arg, 'right': r'\1'} for tag in to_reduce])

    #      _     _       _ _       _     _
    #     | |__ (_) __ _| (_) __ _| |__ | |_
    #     | '_ \| |/ _` | | |/ _` | '_ \| __|
    #     | | | | | (_| | | | (_| | | | | |_
    #     |_| |_|_|\__, |_|_|\__, |_| |_|\__|
    #              |___/     |___/
    #
    # - replace some LaTeX commands by the contents inside curly brackets
    #   and highlight these contents
    # highlightment pattern: #--content--#
    headings = [r'\\part[\*]*', r'\\chapter[\*]*', r'\\section[\*]*',
                r'\\subsection[\*]*', r'\\subsubsection[\*]*',
                r'\\paragraph[\*]*']
    regexps = [{'left': tag + braced_arg, 'right': r'\n#--\1--#\n'}
               for tag in headings]
    # highlightment pattern: [content]
    references = [r'\\title', r'\\author', r'\\thanks', r'\\cite', r'\\ref']
    regexps += [{'left': tag + braced_arg, 'right': r'[\1]'}
                for tag in references]
    text = applyRegexps(text, regexps)

    #      _ __ ___ _ __ ___   _____   _____
    #     | '__/ _ \ '_ ` _ \ / _ \ \ / / _ \
    #     | | |  __/ | | | | | (_) \ V /  __/
    #     |_|  \___|_| |_| |_|\___/ \_/ \___|
    #
    # remove LaTeX tags
    # - remove completely some LaTeX commands that take arguments
    to_remove = [r'\\maketitle', r'\\footnote', r'\\centering',
                 r'\\IEEEpeerreviewmaketitle', r'\\includegraphics',
                 r'\\IEEEauthorrefmark', r'\\label', r'\\begin', r'\\end',
                 r'\\big', r'\\right', r'\\left', r'\\documentclass',
                 r'\\usepackage', r'\\bibliographystyle', r'\\bibliography',
                 r'\\cline', r'\\multicolumn']
    # replace tag with options and argument by a single space; the tag must
    # be a whole command name, otherwise \rightarrow would lose its \right
    # prefix here and never reach the "->" rule below
    text = applyRegexps(
        text,
        [{'left': tag + end_of_name + r'(\[[^\]]*\])*(\{[^\}\{]*\})*',
          'right': r' '}
         for tag in to_remove])

    #                     _
    #      _ __ ___ _ __ | | __ _  ___ ___
    #     | '__/ _ \ '_ \| |/ _` |/ __/ _ \
    #     | | |  __/ |_) | | (_| | (_|  __/
    #     |_|  \___| .__/|_|\__,_|\___\___|
    #              |_|
    #
    # replace some symbols by their ascii equivalent
    regexps = [
        # - common symbols
        {'left': r'\\eg(\{\})* *', 'right': r'e.g., '},
        {'left': r'\\ldots', 'right': r'...'},
        {'left': r'\\Rightarrow', 'right': r'=>'},
        {'left': r'\\rightarrow', 'right': r'->'},
        {'left': r'\\leq?' + end_of_name, 'right': r'<='},
        {'left': r'\\geq?' + end_of_name, 'right': r'>='},
        {'left': r'\\_', 'right': r'_'},
        {'left': r'\\\\', 'right': r'\n'},
        {'left': r'~', 'right': r' '},
        # column separators become tabs; this has to run before \& is
        # unescaped, otherwise literal ampersands would become tabs too
        {'left': r'(?<!\\)&', 'right': r'\t'},
        {'left': r'\\&', 'right': r'&'},
        {'left': r'\\%', 'right': r'%'},
        {'left': r'\\item', 'right': r'\t- '},
        {'left': r'\\hline[ \t]*\\hline',
         'right': r'============================================='},
        {'left': r'[ \t]*\\hline',
         'right': r'_____________________________________________'},
    ]
    # - special letters, written either \'{e} (braced) or \'e (bare)
    special_letters = [("'", 'e', 'é'), ('`', 'a', 'à'),
                       ("'", 'o', 'ó'), ("'", 'a', 'á')]
    for accent, letter, char in special_letters:
        regexps.append({'left': r'\\' + accent + r'\{?\{' + letter + r'\}\}?',
                        'right': char})
        regexps.append({'left': r'\\' + accent + letter, 'right': char})
    regexps += [
        # keep untouched the contents of the equations
        {'left': r'\$(.)\$', 'right': r'\1'},
        {'left': r'\$([^\$]*)\$', 'right': r'\1'},
        # remove the equation symbols ($)
        {'left': r'([^\\])\$', 'right': r'\1'},
        # correct spacing problems (the replacements are plain characters:
        # a backslash in the template would end up in the output)
        {'left': r' +,', 'right': r','},
        {'left': r' +', 'right': r' '},
        {'left': r' +\)', 'right': r')'},
        {'left': r'\( +', 'right': r'('},
        {'left': r' +\.', 'right': r'.'},
        # remove lonely curly brackets
        {'left': r'^([^\{]*)\}', 'right': r'\1'},
        {'left': r'([^\\])\{([^\}]*)\}', 'right': r'\1\2'},
        # unescape \{ and \}
        {'left': r'\\\{', 'right': r'{'},
        {'left': r'\\\}', 'right': r'}'},
        # strip white space characters at end of line
        {'left': r'[ \t]*\n', 'right': r'\n'},
        # collapse a run of three or more line ends into a single one
        {'left': r'([ \t]*\n){3,}', 'right': r'\n'},
    ]
    # apply all those regexps
    text = applyRegexps(text, regexps)
    # return the modified text
    return text

#                  _
#  _ __ ___   __ _(_)_ __
# | '_ ` _ \ / _` | | '_ \
# | | | | | | (_| | | | | |
# |_| |_| |_|\__,_|_|_| |_|

def main():
    """Run detex() on a built-in sample LaTeX document and print the result.

    Just for debugging: the sample is an excerpt of an article containing
    comments, a preamble, sections, a table, lists, a figure and a
    bibliography.
    """
    latexText=r"""
    % This paper can be formatted using the peerreviewca
    % (instead of conference) mode.
    \documentclass[twocolumn,a4paper]{article}
    %\documentclass[peerreviewca]{IEEEtran}
    % correct bad hyphenation here
    \hyphenation{op-ti-cal net-works semi-con-duc-tor IEEEtran pri-va-cy Au-tho-ri-za-tion}
    % package for printing the date and time (version)
    \usepackage{time}
    \begin{document}
    \title{Next Generation Networks}
    \author{Tot titi\thanks{Network and Security -- test company -- toto@ieee.org}}
    \maketitle
    \begin{abstract}\footnote{Version :  \today ;  \now}
    lorem ipsum(\ldots)\end{abstract}
    \emph{Keywords: IP Multimedia Subsystem, Quality of Service}
    \section{Introduction} \label{sect:introduction}
    lorem ipsum(\ldots) \% of the world population. \cite{TISPAN2006a}. \footnote{Bearer Independent Call Control protocol}. 
    \hline
    \section{Protocols used in IMS} \label{sect:protocols}
    lorem ipsum(\ldots) \cite{rfc2327, rfc3264}.
    \subsection{Authentication, Authorization, and Accounting} \label{sect:protocols_aaa}
    lorem ipsum(\ldots)
    \subsubsection{Additional protocols} \label{sect:protocols_additional}
    lorem ipsum(\ldots)
    \begin{table}
        \begin{center}
            \begin{tabular}{|c|c|c|}
            \hline
                \textbf{Capability}                                 & \textbf{UE} & \textbf{GGSN} \\ \hline
                \emph{DiffServ Edge Function}           & Optional      & Required          \\ \hline
                \emph{RSVP/IntServ}                                 & Optional      & Optional          \\ \hline
                \emph{IP Policy Enforcement Point}  & Optional      & Required          \\ \hline
            \end{tabular}
        \caption{IP Bearer Services Manager capability in the UE and GGSN}
        \label{tab_ue_ggsn}
        \end{center}
    \end{table}
     The main transport layer functions are listed below:
    \begin{my_itemize}
        \item The \emph{Resource Control Enforcement Function} (RCEF) enforces policies under the control of the A-RACF. It opens and closes unidirectional filters called \emph{gates} or \emph{pinholes}, polices traffic and marks IP packets \cite{TISPAN2006c}.
        \item  The \emph{Border Gateway Function} (BGF) performs policy enforcement and Network Address Translation (NAT) functions under the control of the S-PDF. It operates on unidirectional flows related to a particular session (micro-flows) \cite{TISPAN2006c}.
        \item  The \emph{Layer 2 Termination Point} (L2TP) terminates the Layer 2 procedures of the access network \cite{TISPAN2006c}.
    \end{my_itemize}
    Their QoS capabilities are summarized in table \ref{tab_rcef_bgf} \cite{TISPAN2006c}.
    The admission control usually follows a three step procedure:
    \begin{my_enumerate}
        \item Authorization of resources (\eg by the A-RACF)
        \item Resource reservation (\eg by the BGF)
        \item Resource commitment (\eg by the RCEF)
    \end{my_enumerate}
    \begin{figure}
    \centering
    \includegraphics[width=1.5in]{./pictures/RACS_functional_architecture}
    \caption{RACS interaction with transfer functions}
    \label{fig_RACS_functional_architecture}
    \end{figure}
    %\subsection{Example}  \label{sect:qos_example}
    % conference papers do not normally have an appendix
    % use section* for acknowledgement
    \section*{Acknowledgment}
    % optional entry into table of contents (if used)
    %\addcontentsline{toc}{section}{Acknowledgment}
    lorem ipsum(\ldots)
    \bibliographystyle{plain}
    %\bibliographystyle{alpha}
    \bibliography{./mabiblio}
    \end{document}
    """
    text=detex(latexText)
    # The single-argument parenthesised form of print behaves the same on
    # Python 2 and Python 3.
    print(text)


# Run the demonstration only when the file is executed as a script.
if __name__ == "__main__":
    main()
Enjoy! And feel free to comment below or to put a link to this article on your blog. Thanks!
Himap

A simple DeTeX function in python - LaTeX to text

No comments:

Post a Comment

Popular Posts

Recent Posts

Categories

Unordered List

Text Widget

Pages

Blog Archive

Search This Blog

Report Abuse

Facebook

Comments

Contact Form

Tags

Labels

Pages - Menu

About Me

FEATURED POSTS