Difference between revisions of "Arxiv.py"

From String Theory Wiki
Jump to navigation Jump to search
(version 0.2 gunzips PS and source files)
(accommodate new arXiv five digit format)
 
(2 intermediate revisions by 2 users not shown)
Line 1: Line 1:
Copy the following text into a file called arxiv.py with your favourite text editor
Copy the following text into a file called arxiv.py and save it.
 
Further usage instructions are available at [[ArXiv_script]].
<pre>
<pre>
#! /usr/bin/python
#! /usr/bin/python


## arXiv script version 0.2
## arXiv script version 0.3


## Copyright 2008 Tom Brown
## Copyright 2015 Tom Brown


## This program is free software; you can redistribute it and/or
## This program is free software; you can redistribute it and/or
Line 48: Line 50:
'''
'''


__version__ = "0.2"
__version__ = "0.3"
__author__ = "Tom Brown"
__author__ = "Tom Brown"
__copyright__ = "Copyright 2008 Tom Brown, GNU GPL 3"
__copyright__ = "Copyright 2015 Tom Brown, GNU GPL 3"




import sys, os, getopt, re, urllib,gzip
import sys, os, getopt, re, urllib, gzip




def findRefType(ref):
def findRefType(ref):
     ref = ref.replace('arxiv:','')
     ref = ref.replace('arxiv:','')
     if re.search(r'^[a-zA-Z\-]+/\d{7}$',ref):
     if re.search(r'^[a-zA-Z\-\.]+/\d{7}$',ref):
         type = 'old-style eprint'
         type = 'old-style eprint'
     elif re.search(r'^\d{7}$',ref):
     elif re.search(r'^\d{7}$',ref):
         type = 'old-style eprint'
         type = 'old-style eprint'
         ref = 'hep-th/' + ref
         ref = 'hep-th/' + ref
     elif re.search('^\d{4}\.\d{4}$',ref):
     elif re.search('^\d{4}\.\d{4,5}$',ref):
         type = 'new-style eprint'
         type = 'new-style eprint'
     else:
     else:
Line 125: Line 127:


def getComments(html):
def getComments(html):
     if html.count("comments") == 0:
     if "comments" not in html:
         return "no comments"
         return "no comments"
     else:
     else:
Line 134: Line 136:


def getJref(html):
def getJref(html):
     if html.count("jref") == 0:
     if "jref" not in html:
         return "no journal reference"
         return "no journal reference"
     else:
     else:
Line 212: Line 214:


     type, ref = findRefType(ref)
     type, ref = findRefType(ref)
    print "Reference",ref,"is of type",type


     if type=="not arXiv":
     if type=="not arXiv":
Line 217: Line 222:
         sys.exit(0)
         sys.exit(0)


     if (authorOpt+titleOpt+abstractOpt+commentsOpt+jrefOpt > 0):
     if authorOpt+titleOpt+abstractOpt+commentsOpt+jrefOpt > 0:
         htmlObject = urllib.urlopen('http://arxiv.org/abs/' + ref)
         htmlObject = urllib.urlopen('http://arxiv.org/abs/' + ref)
         html = htmlObject.read()
         html = htmlObject.read()
Line 252: Line 257:
     if sourceOpt:
     if sourceOpt:
         downloadSource(ref,type,"")
         downloadSource(ref,type,"")
</pre>
</pre>

Latest revision as of 16:14, 16 August 2015

Copy the following text into a file called arxiv.py and save it.

Further usage instructions are available at ArXiv_script.

#! /usr/bin/python

## arXiv script version 0.3

## Copyright 2015 Tom Brown

## This program is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 3 of the
## License, or (at your option) any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.

## See http://www.stringwiki.org/wiki/ArXiv_script for more usage
## instructions

'''arXiv script
Usage:
python arxiv.py reference [ -htabcjdps ] [ --help ]
"reference" must be a standard arXiv reference, e.g. hep-th/9711200, 0705.0303.
Options:
-h, --help
displays this help message
-t
displays the title
-a
displays the author(s)
-b
displays the aBstract
-c
displays the comments
-j
displays the journal reference
-d
downloads the PDF
-p
downloads the PS
-s
downloads the source file
'''

__version__ = "0.3"
__author__ = "Tom Brown"
__copyright__ = "Copyright 2015 Tom Brown, GNU GPL 3"


import sys, os, getopt, re, urllib, gzip


def findRefType(ref):
    ref = ref.replace('arxiv:','')
    if re.search(r'^[a-zA-Z\-\.]+/\d{7}$',ref):
        type = 'old-style eprint'
    elif re.search(r'^\d{7}$',ref):
        type = 'old-style eprint'
        ref = 'hep-th/' + ref
    elif re.search('^\d{4}\.\d{4,5}$',ref):
        type = 'new-style eprint'
    else:
        type = 'not arXiv'

    return type, ref




def downloadPDF(ref,type,downloadPath):
    downloadPath = os.path.expanduser(downloadPath)
    if type == 'old-style eprint':
        urllib.urlretrieve('http://arxiv.org/pdf/' + ref, downloadPath + ref.replace('/','-') + '.pdf')
    elif type == 'new-style eprint':
        urllib.urlretrieve('http://arxiv.org/pdf/' + ref, downloadPath + ref + '.pdf')


def downloadPS(ref,type,downloadPath):
    downloadPath = os.path.expanduser(downloadPath)
    filename = downloadPath + ref.replace('/','-')
    urllib.urlretrieve('http://arxiv.org/ps/' + ref, filename)
    gzipFile =  gzip.GzipFile(filename)
    psFile = open(filename + ".ps","w")
    psFile.write(gzipFile.read())
    psFile.close()
    gzipFile.close()
    os.remove(filename)

def downloadSource(ref,type,downloadPath):
    downloadPath = os.path.expanduser(downloadPath)
    filename = downloadPath + ref.replace('/','-')
    urllib.urlretrieve('http://arxiv.org/e-print/' + ref, filename + ".dum")
    gzipFile =  gzip.GzipFile(filename + ".dum")
    sourceFile = open(filename,"w")
    sourceFile.write(gzipFile.read())
    sourceFile.close()
    gzipFile.close()
    os.remove(filename + ".dum")


def getTitle(html):
    title = html[html.find(">Title:</span>")+15:]
    title = title[:title.find("</h1>")]
    return title


def getAuthors(html):
    authors = html[html.find(">Authors:</span>"):]
    authors = authors[authors.find("\">")+2:]
    authors = authors[:authors.find("</div>")]
    authors = re.sub('<[^>]*>','',authors)
    authors = authors.replace("\n","")
    return authors


def getAbstract(html):
    abstract = html[html.find("Abstract:</span>")+17:]
    abstract = abstract[:abstract.find("</blockquote>")-1]
    return abstract

def getComments(html):
    if "comments" not in html:
        return "no comments"
    else:
        comments = html[html.find("comments\">")+10:]
        comments = comments[:comments.find("</td>")]
        return comments


def getJref(html):
    if "jref" not in html:
        return "no journal reference"
    else:
        jref = html[html.find("jref\">")+6:]
        jref = jref[:jref.find("</td>")]
        return jref




if __name__ == "__main__":

    authorOpt = 0
    titleOpt = 0
    abstractOpt = 0
    commentsOpt = 0
    jrefOpt = 0
    pdfOpt = 0
    psOpt = 0
    sourceOpt = 0

    try:
        options, arguments = getopt.gnu_getopt(sys.argv[1:], 
        'hatbcjdpsv', ['help'])
    except getopt.error:
        print 'error: you tried to use an unknown option or the argument for an option that requires it was missing; try \'arxiv.py -h\' for more information'
        sys.exit(0)

    for o,a in options:
        if o in  ('-h','--help'):
            print __doc__
            sys.exit(0)

        elif o == '-a':
            authorOpt = 1

        elif o == '-t':
            titleOpt = 1

        elif o == '-b':
            abstractOpt = 1

        elif o == '-c':
            commentsOpt = 1

        elif o == '-j':
            jrefOpt = 1

        elif o == '-d':
            pdfOpt = 1

        elif o == '-p':
            psOpt = 1

        elif o == '-s':
            sourceOpt = 1


    if len(options) == 0:
        authorOpt = 1
        titleOpt = 1
        abstractOpt = 1
        commentsOpt = 1
        jrefOpt = 1



    if len(arguments) != 1:
        print 'you didn\'t specify an arXiv reference; try \'arxiv.py -h\' for more information'
        sys.exit(0)
    else:
        ref=arguments[0]





    type, ref = findRefType(ref)

    print "Reference",ref,"is of type",type


    if type=="not arXiv":
        print "type not of arXiv form"
        sys.exit(0)

    if authorOpt+titleOpt+abstractOpt+commentsOpt+jrefOpt > 0:
        htmlObject = urllib.urlopen('http://arxiv.org/abs/' + ref)
        html = htmlObject.read()

    if titleOpt:
        title = getTitle(html)
        print title

    if authorOpt:
        authors = getAuthors(html)
        print authors


    if abstractOpt:
        abstract = getAbstract(html)
        print abstract


    if commentsOpt:
        comments = getComments(html)
        print comments


    if jrefOpt:
        jref = getJref(html)
        print jref

    if pdfOpt:
        downloadPDF(ref,type,"")

    if psOpt:
        downloadPS(ref,type,"")

    if sourceOpt:
        downloadSource(ref,type,"")