look up bibliographical information from a doi

This Python script takes one or more DOIs as command-line arguments and prints a BibTeX entry for each one, filled with the information provided by CrossRef. You have to register there and enter the API key they give you into the script (line 5).
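Assuming you save the script as doi2bib.py (the filename is arbitrary) and have put your API key in place, a call looks like this; the DOIs are only placeholders:

python doi2bib.py 10.xxxx/example doi:10.yyyy/another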

#!/usr/bin/env python
 
debug = False
 
crossref_api_key = 'your_crossref_api_key'
 
# get the doi
import sys
for arg in sys.argv[1:]:
    # strip whitespace and the common "doi:" / "http://dx.doi.org/" prefixes
    arg = arg.strip()
    for prefix in ("doi:", "http://", "dx.doi.org/"):
        if arg.startswith(prefix):
            arg = arg[len(prefix):]
    doi = arg.strip()
 
    # clear from previous
    text_journal_title = ""
    text_year = ""
    text_volume = ""
    text_issue = ""
    text_title = ""
    text_first_author_surname = ""
    text_first_page = ""
    text_last_page = ""
    authorlist = []
 
    # download the xml
    import urllib
    from xml.dom import minidom
    usock = urllib.urlopen('http://www.crossref.org/openurl/?id=doi:'+doi+'&noredirect=true&pid='+crossref_api_key+'&format=unixref')
    xmldoc = minidom.parse(usock)
    usock.close()
 
    if debug:
        print xmldoc.toxml()
    print ""
 
    # descend the unixref tree: doi_records > doi_record > crossref > journal
    a = xmldoc.getElementsByTagName("doi_records")[0]
    b = a.getElementsByTagName("doi_record")[0]
    c = b.getElementsByTagName("crossref")[0]
    d = c.getElementsByTagName("journal")[0]
 
    journal_meta = d.getElementsByTagName("journal_metadata")[0]
    journal_title = journal_meta.getElementsByTagName("full_title")[0]
    text_journal_title = journal_title.firstChild.data#.encode('ascii', 'ignore')
 
    journal_issue = d.getElementsByTagName("journal_issue")[0]
    date = journal_issue.getElementsByTagName("publication_date")[0]
    year = date.getElementsByTagName("year")[0]
    text_year = year.firstChild.data#.encode('ascii', 'ignore')
 
    try:
        journal_volume = journal_issue.getElementsByTagName("journal_volume")[0]
        volume = journal_issue.getElementsByTagName("volume")[0]
        text_volume = volume.firstChild.data#.encode('ascii', 'ignore')
    except IndexError:
        pass
 
    try:
        issue = journal_issue.getElementsByTagName("issue")[0]
        text_issue = issue.firstChild.data#.encode('ascii', 'ignore')
    except IndexError:
        pass
 
    journal_article = d.getElementsByTagName("journal_article")[0]
    titles = journal_article.getElementsByTagName("titles")[0]
    title = titles.getElementsByTagName("title")[0]
    text_title = title.firstChild.data#.encode('ascii', 'ignore')
 
    contributors = journal_article.getElementsByTagName("contributors")[0]
    for person_name in contributors.getElementsByTagName("person_name"):
        text_given_name = ""
        text_surname = ""
        # get names
        given_name = person_name.getElementsByTagName("given_name")[0]
        text_given_name = given_name.firstChild.data#.encode('ascii', 'ignore')
        surname = person_name.getElementsByTagName("surname")[0]
        text_surname = surname.firstChild.data#.encode('ascii', 'ignore')
        authorlist.append(text_surname+", "+text_given_name)
        #first author?
        sequence = person_name.attributes.getNamedItem("sequence")
        if sequence.nodeValue == 'first':
            text_first_author_surname = text_surname
 
    try:
        pages = journal_article.getElementsByTagName("pages")[0]
    except IndexError:
        pages = None
    try:
        first_page = pages.getElementsByTagName("first_page")[0]
        text_first_page = first_page.firstChild.data#.encode('ascii', 'ignore')
    except (IndexError, AttributeError):
        pass
    try:
        last_page = pages.getElementsByTagName("last_page")[0]
        text_last_page = last_page.firstChild.data#.encode('ascii', 'ignore')
    except (IndexError, AttributeError):
        pass
    # some journals (e.g. Physical Review) give an article number instead of page numbers
    if pages is None:
        try:
            pages = journal_article.getElementsByTagName("publisher_item")[0]
        except IndexError:
            pages = None
        try:
            first_page = pages.getElementsByTagName("item_number")[0]
            text_first_page = first_page.firstChild.data#.encode('ascii', 'ignore')
        except (IndexError, AttributeError):
            pass
 
    # output
 
    print "@ARTICLE{"+text_first_author_surname+text_year[-2:]+","
    print "author = {"+" and ".join(authorlist)+"},"
    print "title = {"+text_title+"},"
    print "journal = {"+text_journal_title+"},"
    if not text_volume == "":
        print "volume = {"+text_volume+"},"
    if not text_issue == "":
        print "number = {"+text_issue+"},"
    print "year = {"+text_year+"},"
    if ((text_first_page != "") and (text_last_page != "")):
        print "pages = {"+text_first_page+"-"+text_last_page+"},"
    if ((text_first_page != "") and (text_last_page == "")):
        print "pages = {"+text_first_page+"},"
    print "doi = {"+doi+"},"
    print "}"
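For each DOI the script prints one BibTeX entry of roughly the following shape; the citation key is the first author's surname plus the last two digits of the year, and volume, number and pages only appear when CrossRef provides them. All field values below are placeholders, not real data:

@ARTICLE{Surname09,
author = {Surname, Given and Coauthor, Other},
title = {Some article title},
journal = {Some journal},
volume = {12},
number = {3},
year = {2009},
pages = {100-110},
doi = {10.xxxx/example},
}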

One Response to “look up bibliographical information from a doi”

  1. Stian Haklev says:

    This was incredibly useful, thanks a lot. I rewrote it in Ruby to use in an application I am building.

    (it was also fun to think about how to rewrite it in Ruby, I think it ended up a bit more compact, but I don’t know much Python, so it’s possible the Python could have been more compact too).

    If I decide to use this in my application (just experimenting for now) I’d need something that accepts other publication types as well. (Or are DOIs only used for journal articles?)

    require 'open-uri'
    require 'xmlsimple'

    def try(&block)
      begin
        yield block
      rescue
        return false
      end
    end

    debug = true

    doi = ARGV[0].gsub('doi:','').gsub('http://','').gsub('dx.doi.org/','').strip
    crossref_api_key = ''

    url = 'http://www.crossref.org/openurl/?id=doi:'+doi+'&noredirect=true&pid='+crossref_api_key+'&format=unixref'
    puts url if debug

    html = open(url).read
    puts html if debug

    doc = XmlSimple.xml_in(html)

    cit = doc['doi_record'][0]['crossref'][0]['journal'][0]
    journal_meta = cit["journal_metadata"][0]
    journal_title = journal_meta['full_title'][0]
    journal_issue = cit["journal_issue"][0]
    date = journal_issue['publication_date'][0]
    year = date['year'][0]
    journal_volume = try {journal_issue['journal_volume'][0]}
    volume = try {journal_issue['volume'][0]}
    issue = try {journal_issue['issue'][0]}
    journal_article = cit['journal_article'][0]
    titles = journal_article['titles'][0]
    title = titles['title'][0]

    contributors = journal_article['contributors'][0]

    first_author_surname = ''
    authorlist = []
    contributors['person_name'].each do |person_name|
      given_name = person_name['given_name'][0]
      surname = person_name['surname'][0]
      authorlist << "#{surname}, #{given_name}"
      if person_name['sequence'].strip == 'first'
        first_author_surname = surname
      end
    end

    pages = try {journal_article['pages'][0]}
    first_page = try {pages['first_page'][0]}
    last_page = try {pages['last_page'][0]}

    unless pages
      pages = try{journal_article['publisher_item'][0]}
      first_page = try{pages['item_number'][0]}
    end

    out = ''
    citekey = "#{first_author_surname}#{year}#{title.split(" ")[0]}".downcase
    out << "@ARTICLE{#{citekey},\n"
    out << "author = {#{authorlist.join(" and ")}},\n"
    out << "title = {#{title}},\n"
    out << "journal = {#{journal_title}},\n"
    out << "volume = {#{volume}},\n" if volume
    out << "number = {#{issue}},\n" if issue
    out << "year = {#{year}},\n"
    if first_page
      out << "pages = {#{first_page}"
      out << "-#{last_page}" if last_page
      out << "},\n"
    end
    out << "doi = {#{doi}},\n}"

    puts out
