This Python script takes one or more DOIs as input (command-line arguments) and returns BibTeX entries built from the metadata provided by CrossRef. You have to register with CrossRef and enter the API key they give you in the script (the `crossref_api_key` variable near the top).
#!/usr/bin/env python
"""Print a BibTeX @ARTICLE entry for every DOI given on the command line.

Each argument may be a bare DOI or carry a ``doi:`` prefix, an
``http://``/``https://`` scheme and a ``dx.doi.org/``/``doi.org/`` host.
The metadata is fetched from the CrossRef OpenURL service in UNIXREF
format; set ``crossref_api_key`` below to the key CrossRef gave you.

The script runs unchanged on Python 2 and Python 3.
"""

import sys
from xml.dom import minidom
from xml.parsers.expat import ExpatError

try:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import quote, urlopen

debug = False
crossref_api_key = 'your_crossref_api_key'

# Prefixes removed (in this order, case-insensitively) from an argument.
DOI_PREFIXES = ("doi:", "https://", "http://", "dx.doi.org/", "doi.org/")


def clean_doi(arg):
    """Return the bare DOI contained in the command-line argument *arg*.

    Only whole leading prefixes are removed.  ``str.strip(chars)`` must not
    be used for this: it removes any of the given *characters* from both
    ends and therefore eats the tail of DOIs ending in e.g. ``g`` or ``i``.
    """
    doi = arg.strip()
    for prefix in DOI_PREFIXES:
        if doi.lower().startswith(prefix):
            doi = doi[len(prefix):].strip()
    return doi


def _first(node, tag):
    """Return the first descendant element of *node* named *tag*, or None."""
    if node is None:
        return None
    found = node.getElementsByTagName(tag)
    return found[0] if found else None


def _collect_text(node, parts):
    """Append the data of every text node below *node* to *parts*."""
    for child in node.childNodes:
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
            parts.append(child.data)
        else:
            _collect_text(child, parts)


def _text(node):
    """Return the whitespace-normalised text below *node* ("" for None).

    All descendant text is gathered, so a title with inline markup such as
    ``On <i>x</i> and y`` is returned in full instead of being cut off at
    the first child element.
    """
    if node is None:
        return ""
    parts = []
    _collect_text(node, parts)
    return " ".join("".join(parts).split())


def unixref_to_bibtex(xml_data, doi):
    """Convert a UNIXREF document into a BibTeX ``@ARTICLE`` entry.

    Args:
        xml_data: the XML document as returned by CrossRef (str or bytes).
        doi: the DOI to record in the ``doi`` field of the entry.

    Returns:
        The entry as a string without a trailing newline.

    Raises:
        ValueError: if the document holds no journal article record.
        xml.parsers.expat.ExpatError: if *xml_data* is not well-formed XML.
    """
    journal = minidom.parseString(xml_data)
    for tag in ("doi_records", "doi_record", "crossref", "journal"):
        journal = _first(journal, tag)
    article = _first(journal, "journal_article")
    if article is None:
        raise ValueError("no journal article record found")

    journal_title = _text(
        _first(_first(journal, "journal_metadata"), "full_title"))
    issue = _first(journal, "journal_issue")
    year = _text(_first(_first(issue, "publication_date"), "year"))
    if not year:
        # Fall back to a publication date below the article itself when
        # the issue element is missing or carries no year.
        year = _text(_first(_first(article, "publication_date"), "year"))
    volume = _text(_first(issue, "volume"))
    number = _text(_first(issue, "issue"))
    title = _text(_first(_first(article, "titles"), "title"))

    authors = []
    surnames = []
    first_surname = ""
    contributors = _first(article, "contributors")
    if contributors is not None:
        for person in contributors.getElementsByTagName("person_name"):
            surname = _text(_first(person, "surname"))
            given = _text(_first(person, "given_name"))
            surnames.append(surname)
            authors.append((surname + ", " + given) if given else surname)
            if person.getAttribute("sequence") == "first":
                first_surname = surname
    if not first_surname and surnames:
        # Nobody is flagged as first author: use the first one listed.
        first_surname = surnames[0]

    pages = _first(article, "pages")
    first_page = _text(_first(pages, "first_page"))
    last_page = _text(_first(pages, "last_page"))
    if pages is None:
        # Journals without page numbers (e.g. Physical Review) carry an
        # article number in publisher_item/item_number instead.
        first_page = _text(
            _first(_first(article, "publisher_item"), "item_number"))

    # Citation keys must not contain whitespace.
    key = "".join((first_surname + year[-2:]).split())
    lines = [
        "@ARTICLE{" + key + ",",
        "author = {" + " and ".join(authors) + "},",
        "title = {" + title + "},",
        "journal = {" + journal_title + "},",
    ]
    if volume:
        lines.append("volume = {" + volume + "},")
    if number:
        lines.append("number = {" + number + "},")
    lines.append("year = {" + year + "},")
    if first_page and last_page:
        lines.append("pages = {" + first_page + "-" + last_page + "},")
    elif first_page:
        lines.append("pages = {" + first_page + "},")
    lines.append("doi = {" + doi + "},")
    lines.append("}")
    return "\n".join(lines)


def fetch_unixref(doi):
    """Download and return the raw UNIXREF record for *doi* from CrossRef."""
    url = ('http://www.crossref.org/openurl/?id=doi:' + quote(doi, safe="/")
           + '&noredirect=true&pid=' + crossref_api_key + '&format=unixref')
    usock = urlopen(url)
    try:
        return usock.read()
    finally:
        usock.close()


def main(argv=None):
    """Print one BibTeX entry per DOI argument; return the exit status.

    A DOI that cannot be fetched or converted is reported on stderr and
    does not prevent the remaining DOIs from being processed.
    """
    args = sys.argv[1:] if argv is None else argv
    status = 0
    for arg in args:
        doi = clean_doi(arg)
        try:
            xml_data = fetch_unixref(doi)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            if debug:
                print(minidom.parseString(xml_data).toxml())
                print("")
            print(unixref_to_bibtex(xml_data, doi))
        except (IOError, ValueError, ExpatError) as err:
            sys.stderr.write("%s: %s\n" % (doi, err))
            status = 1
    return status


if __name__ == "__main__":
    sys.exit(main())
This was incredibly useful, thanks a lot. I rewrote it in Ruby to use in an application I am building.
(It was also fun to think about how to rewrite it in Ruby. I think it ended up a bit more compact, but I don’t know much Python, so it’s possible the Python could have been more compact too.)
If I decide to use this in my application (just experimenting for now) I’d need something that accepts other publication types as well. (Or are DOIs only used for journal articles?)
require 'open-uri'
require 'xmlsimple'
# Run the given block and return its value; if the block raises a
# StandardError, return false instead of propagating the error.
# Used below for optional lookups in the parsed XML.
def try(&block)
  block.call(block)
rescue StandardError
  false
end
# Look up the DOI given as the first command-line argument at CrossRef
# (UNIXREF format) and print a BibTeX @ARTICLE entry for it.
debug = true

abort "usage: #{$PROGRAM_NAME} DOI" if ARGV.empty?

# Remove only leading prefixes; text inside the DOI itself is left alone.
doi = ARGV[0].strip
             .sub(/\Adoi:/i, '')
             .sub(%r{\Ahttps?://}i, '')
             .sub(%r{\A(dx\.)?doi\.org/}i, '')
             .strip
crossref_api_key = ''
url = 'http://www.crossref.org/openurl/?id=doi:' + doi + '&noredirect=true&pid=' + crossref_api_key + '&format=unixref'
puts url if debug

# URI#read is provided by open-uri; Kernel#open no longer accepts URLs on
# current Ruby versions.
html = URI.parse(url).read
puts html if debug

doc = XmlSimple.xml_in(html)
cit = doc['doi_record'][0]['crossref'][0]['journal'][0]

journal_meta = cit['journal_metadata'][0]
journal_title = journal_meta['full_title'][0]

journal_issue = cit['journal_issue'][0]
date = journal_issue['publication_date'][0]
year = date['year'][0]
# XmlSimple does not search recursively: the volume number is nested in
# <journal_volume>. The direct lookup is kept as a fallback.
volume = try { journal_issue['journal_volume'][0]['volume'][0] } ||
         try { journal_issue['volume'][0] }
issue = try { journal_issue['issue'][0] }

journal_article = cit['journal_article'][0]
titles = journal_article['titles'][0]
title = titles['title'][0]

contributors = journal_article['contributors'][0]
first_author_surname = ''
authorlist = []
contributors['person_name'].each do |person_name|
  surname = person_name['surname'][0]
  # Some contributors are listed with a surname only.
  given_name = try { person_name['given_name'][0] }
  authorlist << (given_name ? "#{surname}, #{given_name}" : surname)
  first_author_surname = surname if person_name['sequence'].to_s.strip == 'first'
end

pages = try { journal_article['pages'][0] }
first_page = try { pages['first_page'][0] }
last_page = try { pages['last_page'][0] }
unless pages
  # No page range: use the publisher's item number as the page instead.
  pages = try { journal_article['publisher_item'][0] }
  first_page = try { pages['item_number'][0] }
end

citekey = "#{first_author_surname}#{year}#{title.split(" ")[0]}".downcase
lines = []
lines << "@ARTICLE{#{citekey},"
lines << "author = {#{authorlist.join(" and ")}},"
lines << "title = {#{title}},"
lines << "journal = {#{journal_title}},"
lines << "volume = {#{volume}}," if volume
lines << "number = {#{issue}}," if issue
lines << "year = {#{year}},"
if first_page
  page_range = first_page.to_s
  page_range += "-#{last_page}" if last_page
  lines << "pages = {#{page_range}},"
end
lines << "doi = {#{doi}},"
lines << "}"
puts lines.join("\n")