look up bibliographical information from a doi

This Python script takes one or more DOIs as command-line arguments and prints BibTeX entries built from the metadata that CrossRef provides. You have to register with CrossRef and enter the API key they give you in the script (the `crossref_api_key` variable near the top).

#!/usr/bin/env python
"""Print BibTeX @ARTICLE entries for DOIs using CrossRef's OpenURL service.

Usage: pass one or more DOIs as command-line arguments.  Each argument may
carry a ``doi:`` prefix or be a ``http(s)://(dx.)doi.org/`` URL.  Set
``crossref_api_key`` below to the key CrossRef gave you.

Runs on both Python 2 and Python 3.
"""

import sys
from contextlib import closing
from xml.dom import minidom
from xml.parsers.expat import ExpatError

try:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import quote, urlopen

debug = False

crossref_api_key = 'your_crossref_api_key'

# Prefixes removed, in this order, from the front of a command-line argument.
DOI_PREFIXES = ("doi:", "https://", "http://", "dx.doi.org/", "doi.org/")


def normalize_doi(raw):
    """Return the bare DOI contained in *raw*.

    Only leading prefixes are removed.  (``str.strip(chars)`` treats its
    argument as a set of characters and eats matching characters from both
    ends, which corrupts DOIs that end in e.g. "d", "o", "i" or "g".)
    """
    doi = raw.strip()
    for prefix in DOI_PREFIXES:
        if doi.lower().startswith(prefix):
            doi = doi[len(prefix):].lstrip()
    return doi


def fetch_unixref(doi):
    """Download the unixref record for *doi* and return it as a minidom Document.

    Raises IOError (OSError on Python 3) on network failure and ExpatError
    if the response is not well-formed XML.
    """
    url = ('http://www.crossref.org/openurl/?id=doi:' + quote(doi, safe="/")
           + '&noredirect=true&pid=' + crossref_api_key + '&format=unixref')
    # closing() guarantees the socket is released even if parsing fails.
    with closing(urlopen(url)) as usock:
        return minidom.parse(usock)


def _first(parent, tag):
    """Return the first descendant of *parent* named *tag*, or None."""
    if parent is None:
        return None
    found = parent.getElementsByTagName(tag)
    return found[0] if found else None


def _node_text(node):
    """Concatenate all text below *node*, descending into inline markup."""
    parts = []
    for child in node.childNodes:
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
            parts.append(child.data)
        elif child.nodeType == child.ELEMENT_NODE:
            parts.append(_node_text(child))
    return "".join(parts)


def _text(parent, tag):
    """Return the text of the first *tag* below *parent*, or "" if absent."""
    node = _first(parent, tag)
    return _node_text(node) if node is not None else ""


def parse_unixref(xmldoc):
    """Extract the bibliographic fields of a journal article from *xmldoc*.

    Returns a dict with the keys ``journal``, ``year``, ``volume``,
    ``issue``, ``title``, ``authors`` (list of "Surname, Given" strings),
    ``first_author_surname``, ``first_page`` and ``last_page``.  Fields
    missing from the record are empty strings (or an empty list).

    Raises ValueError if the document holds no journal article record.
    """
    journal = xmldoc
    for tag in ("doi_records", "doi_record", "crossref", "journal"):
        journal = _first(journal, tag)
    article = _first(journal, "journal_article")
    if article is None:
        raise ValueError("no journal article record in CrossRef response")

    issue = _first(journal, "journal_issue")
    year = _text(_first(issue, "publication_date"), "year")
    if not year:
        # Fall back to the article's own publication date.
        year = _text(_first(article, "publication_date"), "year")

    authors = []
    first_author_surname = ""
    contributors = _first(article, "contributors")
    persons = [] if contributors is None else \
        contributors.getElementsByTagName("person_name")
    for person in persons:
        surname = _text(person, "surname")
        given_name = _text(person, "given_name")
        # given_name is optional; avoid a dangling ", " when it is missing.
        authors.append(surname + ", " + given_name if given_name else surname)
        # getAttribute returns "" (not None) when the attribute is missing.
        if person.getAttribute("sequence") == 'first':
            first_author_surname = surname

    pages = _first(article, "pages")
    first_page = _text(pages, "first_page")
    last_page = _text(pages, "last_page")
    if pages is None:
        # Physical Review style: an article number instead of a page range.
        first_page = _text(_first(article, "publisher_item"), "item_number")

    return {
        "journal": _text(_first(journal, "journal_metadata"), "full_title"),
        "year": year,
        "volume": _text(_first(issue, "journal_volume"), "volume"),
        "issue": _text(issue, "issue"),
        "title": _text(_first(article, "titles"), "title"),
        "authors": authors,
        "first_author_surname": first_author_surname,
        "first_page": first_page,
        "last_page": last_page,
    }


def format_bibtex(record, doi):
    """Render *record* (as returned by parse_unixref) as a BibTeX entry.

    The citation key is the first author's surname followed by the last
    two digits of the year.  Returns the entry without a trailing newline.
    """
    lines = [
        "@ARTICLE{" + record["first_author_surname"] + record["year"][-2:] + ",",
        "author = {" + " and ".join(record["authors"]) + "},",
        "title = {" + record["title"] + "},",
        "journal = {" + record["journal"] + "},",
    ]
    if record["volume"]:
        lines.append("volume = {" + record["volume"] + "},")
    if record["issue"]:
        lines.append("number = {" + record["issue"] + "},")
    lines.append("year = {" + record["year"] + "},")
    if record["first_page"]:
        page_range = record["first_page"]
        if record["last_page"]:
            page_range += "-" + record["last_page"]
        lines.append("pages = {" + page_range + "},")
    lines.append("doi = {" + doi + "},")
    lines.append("}")
    return "\n".join(lines)


def main(argv=None):
    """Print one BibTeX entry per DOI in *argv* (default: ``sys.argv[1:]``).

    A DOI that cannot be fetched or parsed is reported on stderr and
    skipped so the remaining arguments are still processed.  Returns 0 if
    every DOI succeeded and 1 otherwise.
    """
    args = sys.argv[1:] if argv is None else argv
    status = 0
    for arg in args:
        doi = normalize_doi(arg)
        try:
            xmldoc = fetch_unixref(doi)
            if debug:
                print(xmldoc.toxml())
            record = parse_unixref(xmldoc)
        except (IOError, ValueError, ExpatError) as err:
            sys.stderr.write("%s: %s\n" % (doi, err))
            status = 1
            continue
        print("")
        print(format_bibtex(record, doi))
    return status


if __name__ == "__main__":
    sys.exit(main())

#!/usr/bin/env python
"""Print BibTeX @ARTICLE entries for DOIs using CrossRef's OpenURL service.

Usage: pass one or more DOIs as command-line arguments.  Each argument may
carry a ``doi:`` prefix or be a ``http(s)://(dx.)doi.org/`` URL.  Set
``crossref_api_key`` below to the key CrossRef gave you.

Runs on both Python 2 and Python 3.
"""

import sys
from contextlib import closing
from xml.dom import minidom
from xml.parsers.expat import ExpatError

try:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import quote, urlopen

debug = False

crossref_api_key = 'your_crossref_api_key'

# Prefixes removed, in this order, from the front of a command-line argument.
DOI_PREFIXES = ("doi:", "https://", "http://", "dx.doi.org/", "doi.org/")


def normalize_doi(raw):
    """Return the bare DOI contained in *raw*.

    Only leading prefixes are removed.  (``str.strip(chars)`` treats its
    argument as a set of characters and eats matching characters from both
    ends, which corrupts DOIs that end in e.g. "d", "o", "i" or "g".)
    """
    doi = raw.strip()
    for prefix in DOI_PREFIXES:
        if doi.lower().startswith(prefix):
            doi = doi[len(prefix):].lstrip()
    return doi


def fetch_unixref(doi):
    """Download the unixref record for *doi* and return it as a minidom Document.

    Raises IOError (OSError on Python 3) on network failure and ExpatError
    if the response is not well-formed XML.
    """
    url = ('http://www.crossref.org/openurl/?id=doi:' + quote(doi, safe="/")
           + '&noredirect=true&pid=' + crossref_api_key + '&format=unixref')
    # closing() guarantees the socket is released even if parsing fails.
    with closing(urlopen(url)) as usock:
        return minidom.parse(usock)


def _first(parent, tag):
    """Return the first descendant of *parent* named *tag*, or None."""
    if parent is None:
        return None
    found = parent.getElementsByTagName(tag)
    return found[0] if found else None


def _node_text(node):
    """Concatenate all text below *node*, descending into inline markup."""
    parts = []
    for child in node.childNodes:
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
            parts.append(child.data)
        elif child.nodeType == child.ELEMENT_NODE:
            parts.append(_node_text(child))
    return "".join(parts)


def _text(parent, tag):
    """Return the text of the first *tag* below *parent*, or "" if absent."""
    node = _first(parent, tag)
    return _node_text(node) if node is not None else ""


def parse_unixref(xmldoc):
    """Extract the bibliographic fields of a journal article from *xmldoc*.

    Returns a dict with the keys ``journal``, ``year``, ``volume``,
    ``issue``, ``title``, ``authors`` (list of "Surname, Given" strings),
    ``first_author_surname``, ``first_page`` and ``last_page``.  Fields
    missing from the record are empty strings (or an empty list).

    Raises ValueError if the document holds no journal article record.
    """
    journal = xmldoc
    for tag in ("doi_records", "doi_record", "crossref", "journal"):
        journal = _first(journal, tag)
    article = _first(journal, "journal_article")
    if article is None:
        raise ValueError("no journal article record in CrossRef response")

    issue = _first(journal, "journal_issue")
    year = _text(_first(issue, "publication_date"), "year")
    if not year:
        # Fall back to the article's own publication date.
        year = _text(_first(article, "publication_date"), "year")

    authors = []
    first_author_surname = ""
    contributors = _first(article, "contributors")
    persons = [] if contributors is None else \
        contributors.getElementsByTagName("person_name")
    for person in persons:
        surname = _text(person, "surname")
        given_name = _text(person, "given_name")
        # given_name is optional; avoid a dangling ", " when it is missing.
        authors.append(surname + ", " + given_name if given_name else surname)
        # getAttribute returns "" (not None) when the attribute is missing.
        if person.getAttribute("sequence") == 'first':
            first_author_surname = surname

    pages = _first(article, "pages")
    first_page = _text(pages, "first_page")
    last_page = _text(pages, "last_page")
    if pages is None:
        # Physical Review style: an article number instead of a page range.
        first_page = _text(_first(article, "publisher_item"), "item_number")

    return {
        "journal": _text(_first(journal, "journal_metadata"), "full_title"),
        "year": year,
        "volume": _text(_first(issue, "journal_volume"), "volume"),
        "issue": _text(issue, "issue"),
        "title": _text(_first(article, "titles"), "title"),
        "authors": authors,
        "first_author_surname": first_author_surname,
        "first_page": first_page,
        "last_page": last_page,
    }


def format_bibtex(record, doi):
    """Render *record* (as returned by parse_unixref) as a BibTeX entry.

    The citation key is the first author's surname followed by the last
    two digits of the year.  Returns the entry without a trailing newline.
    """
    lines = [
        "@ARTICLE{" + record["first_author_surname"] + record["year"][-2:] + ",",
        "author = {" + " and ".join(record["authors"]) + "},",
        "title = {" + record["title"] + "},",
        "journal = {" + record["journal"] + "},",
    ]
    if record["volume"]:
        lines.append("volume = {" + record["volume"] + "},")
    if record["issue"]:
        lines.append("number = {" + record["issue"] + "},")
    lines.append("year = {" + record["year"] + "},")
    if record["first_page"]:
        page_range = record["first_page"]
        if record["last_page"]:
            page_range += "-" + record["last_page"]
        lines.append("pages = {" + page_range + "},")
    lines.append("doi = {" + doi + "},")
    lines.append("}")
    return "\n".join(lines)


def main(argv=None):
    """Print one BibTeX entry per DOI in *argv* (default: ``sys.argv[1:]``).

    A DOI that cannot be fetched or parsed is reported on stderr and
    skipped so the remaining arguments are still processed.  Returns 0 if
    every DOI succeeded and 1 otherwise.
    """
    args = sys.argv[1:] if argv is None else argv
    status = 0
    for arg in args:
        doi = normalize_doi(arg)
        try:
            xmldoc = fetch_unixref(doi)
            if debug:
                print(xmldoc.toxml())
            record = parse_unixref(xmldoc)
        except (IOError, ValueError, ExpatError) as err:
            sys.stderr.write("%s: %s\n" % (doi, err))
            status = 1
            continue
        print("")
        print(format_bibtex(record, doi))
    return status


if __name__ == "__main__":
    sys.exit(main())

This entry was posted on Wednesday, October 5th, 2011 at 14:26 and is filed under misc. You can follow any responses to this entry through the RSS 2.0 feed. You can leave a response, or trackback from your own site.

One Response to “look up bibliographical information from a doi”

Stian Haklev says:

2012/02/21 at 17:25

This was incredibly useful, thanks a lot. I rewrote it in Ruby to use in an application I am building.

(it was also fun to think about how to rewrite it in Ruby, I think it ended up a bit more compact, but I don’t know much Python, so it’s possible the Python could have been more compact too).

If I decide to use this in my application (just experimenting for now) I’d need something that accepts other publication types as well. (Or are DOIs only used for journal articles?)

# Look up the CrossRef record for the DOI given as the first command-line
# argument and print it as a BibTeX @ARTICLE entry.
require 'open-uri'
require 'xmlsimple'

# Run the given block and return its value, or false if it raises a
# StandardError (used for optional parts of the record).
def try
  yield
rescue StandardError
  false
end

debug = true

abort "usage: #{$PROGRAM_NAME} DOI" if ARGV.empty?

# Remove only *leading* "doi:" / URL prefixes; gsub would also delete these
# substrings from the middle of a DOI.
doi = ARGV[0].strip
             .sub(/\Adoi:/i, '')
             .sub(%r{\Ahttps?://}i, '')
             .sub(%r{\A(dx\.)?doi\.org/}i, '')
             .strip
crossref_api_key = ''

url = 'http://www.crossref.org/openurl/?id=doi:' + doi + '&noredirect=true&pid=' + crossref_api_key + '&format=unixref'
puts url if debug

# Kernel#open no longer accepts URLs on Ruby 3; the read method that open-uri
# mixes into URI::HTTP works on old and new Rubies alike.
html = URI.parse(url).read
puts html if debug

doc = XmlSimple.xml_in(html)

cit = doc['doi_record'][0]['crossref'][0]['journal'][0]
journal_meta = cit['journal_metadata'][0]
journal_title = journal_meta['full_title'][0]
journal_issue = cit['journal_issue'][0]
date = journal_issue['publication_date'][0]
year = date['year'][0]
journal_volume = try { journal_issue['journal_volume'][0] }
# <volume> is nested inside <journal_volume>; XmlSimple lookups are not
# recursive, so it has to be read from there rather than from journal_issue.
volume = try { journal_volume['volume'][0] }
issue = try { journal_issue['issue'][0] }
journal_article = cit['journal_article'][0]
titles = journal_article['titles'][0]
title = titles['title'][0]

contributors = journal_article['contributors'][0]

first_author_surname = ''
authorlist = []
contributors['person_name'].each do |person_name|
  given_name = person_name['given_name'][0]
  surname = person_name['surname'][0]
  authorlist << "#{surname}, #{given_name}"
  first_author_surname = surname if person_name['sequence'].strip == 'first'
end

pages = try { journal_article['pages'][0] }
first_page = try { pages['first_page'][0] }
last_page = try { pages['last_page'][0] }

unless pages
  # Some records carry an article number instead of a page range.
  pages = try { journal_article['publisher_item'][0] }
  first_page = try { pages['item_number'][0] }
end

citekey = "#{first_author_surname}#{year}#{title.split(' ')[0]}".downcase

lines = []
lines << "@ARTICLE{#{citekey},"
lines << "author = {#{authorlist.join(' and ')}},"
lines << "title = {#{title}},"
lines << "journal = {#{journal_title}},"
lines << "volume = {#{volume}}," if volume
lines << "number = {#{issue}}," if issue
lines << "year = {#{year}},"
if first_page
  page_range = first_page.to_s
  page_range += "-#{last_page}" if last_page
  lines << "pages = {#{page_range}},"
end
lines << "doi = {#{doi}},"
lines << '}'

puts lines.join("\n")

look up bibliographical information from a doi

One Response to “look up bibliographical information from a doi”

Leave a Reply