Skip to content

Commit

Permalink
Merge pull request academicpages#197 from mborowczak/pubsfrombib
Browse files Browse the repository at this point in the history
Ability to generate publications from bibtex files for academicpages#2
  • Loading branch information
staeiou authored Feb 19, 2019
2 parents 1859ec1 + 9458158 commit de37ec3
Show file tree
Hide file tree
Showing 2 changed files with 383 additions and 0 deletions.
223 changes: 223 additions & 0 deletions markdown_generator/PubsFromBib.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Publications markdown generator for academicpages\n",
"\n",
"Takes a set of BibTeX files of publications and converts them for use with [academicpages.github.io](https://academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)). \n",
"\n",
"The core python code is also in `pubsFromBib.py`. \n",
"Run either from the `markdown_generator` folder after updating the publist dictionary with:\n",
"* bib file names\n",
"* specific venue keys based on your bib file preferences\n",
"* any specific pre-text for specific files\n",
"* Collection Name (future feature)\n",
"\n",
"TODO: Make this work with other databases of citations, \n",
"TODO: Merge this with the existing TSV parsing solution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pybtex.database.input import bibtex\n",
"import pybtex.database.input.bibtex \n",
"from time import strptime\n",
"import string\n",
"import html\n",
"import os\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#todo: incorporate different collection types rather than a catch all publications, requires other changes to template\n",
"publist = {\n",
" \"proceeding\": {\n",
" \"file\" : \"proceedings.bib\",\n",
" \"venuekey\": \"booktitle\",\n",
" \"venue-pretext\": \"In the proceedings of \",\n",
" \"collection\" : {\"name\":\"publications\",\n",
" \"permalink\":\"/publication/\"}\n",
" \n",
" },\n",
" \"journal\":{\n",
" \"file\": \"pubs.bib\",\n",
" \"venuekey\" : \"journal\",\n",
" \"venue-pretext\" : \"\",\n",
" \"collection\" : {\"name\":\"publications\",\n",
" \"permalink\":\"/publication/\"}\n",
" } \n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"html_escape_table = {\n",
" \"&\": \"&amp;\",\n",
" '\"': \"&quot;\",\n",
" \"'\": \"&apos;\"\n",
" }\n",
"\n",
"def html_escape(text):\n",
" \"\"\"Produce entities within text.\"\"\"\n",
" return \"\".join(html_escape_table.get(c,c) for c in text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"for pubsource in publist:\n",
" parser = bibtex.Parser()\n",
" bibdata = parser.parse_file(publist[pubsource][\"file\"])\n",
"\n",
" #loop through the individual references in a given bibtex file\n",
" for bib_id in bibdata.entries:\n",
" #reset default date\n",
" pub_year = \"1900\"\n",
" pub_month = \"01\"\n",
" pub_day = \"01\"\n",
" \n",
" b = bibdata.entries[bib_id].fields\n",
" \n",
" try:\n",
" pub_year = f'{b[\"year\"]}'\n",
"\n",
" #todo: this hack for month and day needs some cleanup\n",
" if \"month\" in b.keys(): \n",
" if(len(b[\"month\"])<3):\n",
" pub_month = \"0\"+b[\"month\"]\n",
" pub_month = pub_month[-2:]\n",
" elif(b[\"month\"] not in range(12)):\n",
" tmnth = strptime(b[\"month\"][:3],'%b').tm_mon \n",
" pub_month = \"{:02d}\".format(tmnth) \n",
" else:\n",
" pub_month = str(b[\"month\"])\n",
" if \"day\" in b.keys(): \n",
" pub_day = str(b[\"day\"])\n",
"\n",
" \n",
" pub_date = pub_year+\"-\"+pub_month+\"-\"+pub_day\n",
" \n",
" #strip out {} as needed (some bibtex entries that maintain formatting)\n",
" clean_title = b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\").replace(\" \",\"-\") \n",
"\n",
" url_slug = re.sub(\"\\\\[.*\\\\]|[^a-zA-Z0-9_-]\", \"\", clean_title)\n",
" url_slug = url_slug.replace(\"--\",\"-\")\n",
"\n",
" md_filename = (str(pub_date) + \"-\" + url_slug + \".md\").replace(\"--\",\"-\")\n",
" html_filename = (str(pub_date) + \"-\" + url_slug).replace(\"--\",\"-\")\n",
"\n",
" #Build Citation from text\n",
" citation = \"\"\n",
"\n",
" #citation authors - todo - add highlighting for primary author?\n",
" for author in bibdata.entries[bib_id].persons[\"author\"]:\n",
" citation = citation+\" \"+author.first_names[0]+\" \"+author.last_names[0]+\", \"\n",
"\n",
" #citation title\n",
" citation = citation + \"\\\"\" + html_escape(b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")) + \".\\\"\"\n",
"\n",
" #add venue logic depending on citation type\n",
" venue = publist[pubsource][\"venue-pretext\"]+b[publist[pubsource][\"venuekey\"]].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")\n",
"\n",
" citation = citation + \" \" + html_escape(venue)\n",
" citation = citation + \", \" + pub_year + \".\"\n",
"\n",
" \n",
" ## YAML variables\n",
" md = \"---\\ntitle: \\\"\" + html_escape(b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")) + '\"\\n'\n",
" \n",
" md += \"\"\"collection: \"\"\" + publist[pubsource][\"collection\"][\"name\"]\n",
"\n",
" md += \"\"\"\\npermalink: \"\"\" + publist[pubsource][\"collection\"][\"permalink\"] + html_filename\n",
" \n",
" note = False\n",
" if \"note\" in b.keys():\n",
" if len(str(b[\"note\"])) > 5:\n",
" md += \"\\nexcerpt: '\" + html_escape(b[\"note\"]) + \"'\"\n",
" note = True\n",
"\n",
" md += \"\\ndate: \" + str(pub_date) \n",
"\n",
" md += \"\\nvenue: '\" + html_escape(venue) + \"'\"\n",
" \n",
" url = False\n",
" if \"url\" in b.keys():\n",
" if len(str(b[\"url\"])) > 5:\n",
" md += \"\\npaperurl: '\" + b[\"url\"] + \"'\"\n",
" url = True\n",
"\n",
" md += \"\\ncitation: '\" + html_escape(citation) + \"'\"\n",
"\n",
" md += \"\\n---\"\n",
"\n",
" \n",
" ## Markdown description for individual page\n",
" if note:\n",
" md += \"\\n\" + html_escape(b[\"note\"]) + \"\\n\"\n",
"\n",
" if url:\n",
" md += \"\\n[Access paper here](\" + b[\"url\"] + \"){:target=\\\"_blank\\\"}\\n\" \n",
" else:\n",
" md += \"\\nUse [Google Scholar](https://scholar.google.com/scholar?q=\"+html.escape(clean_title.replace(\"-\",\"+\"))+\"){:target=\\\"_blank\\\"} for full citation\"\n",
"\n",
" md_filename = os.path.basename(md_filename)\n",
"\n",
" with open(\"../_publications/\" + md_filename, 'w') as f:\n",
" f.write(md)\n",
" print(f'SUCESSFULLY PARSED {bib_id}: \\\"', b[\"title\"][:60],\"...\"*(len(b['title'])>60),\"\\\"\")\n",
" # field may not exist for a reference\n",
" except KeyError as e:\n",
" print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \\\"', b[\"title\"][:30],\"...\"*(len(b['title'])>30),\"\\\"\")\n",
" continue\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
160 changes: 160 additions & 0 deletions markdown_generator/pubsFromBib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
#!/usr/bin/env python
# coding: utf-8

# # Publications markdown generator for academicpages
#
# Takes a set of BibTeX files of publications and converts them for use with [academicpages.github.io](https://academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)).
#
# The core python code is also in `pubsFromBib.py`.
# Run either from the `markdown_generator` folder after updating the publist dictionary with:
# * bib file names
# * specific venue keys based on your bib file preferences
# * any specific pre-text for specific files
# * Collection Name (future feature)
#
# TODO: Make this work with other databases of citations,
# TODO: Merge this with the existing TSV parsing solution


from pybtex.database.input import bibtex
import pybtex.database.input.bibtex
from time import strptime
import string
import html
import os
import re

#todo: incorporate different collection types rather than a catch all publications, requires other changes to template
# Registry of bibtex sources to convert, keyed by a free-form source label.
# Each source describes:
#   file          - the bibtex file handed to the parser
#   venuekey      - the bibtex field that is read to obtain the venue name
#   venue-pretext - text placed directly in front of the venue name
#   collection    - the collection name and permalink prefix written into
#                   the front matter of every generated page
publist = {}

publist["proceeding"] = {
    "file": "proceedings.bib",
    "venuekey": "booktitle",
    "venue-pretext": "In the proceedings of ",
    "collection": {
        "name": "publications",
        "permalink": "/publication/",
    },
}

publist["journal"] = {
    "file": "pubs.bib",
    "venuekey": "journal",
    "venue-pretext": "",
    "collection": {
        "name": "publications",
        "permalink": "/publication/",
    },
}

# Characters replaced by html_escape, mapped to the HTML entity used for each.
html_escape_table = dict([
    ("&", "&amp;"),
    ('"', "&quot;"),
    ("'", "&apos;"),
])


def html_escape(text):
    """Return text with each character found in html_escape_table swapped for its entity.

    Characters that have no entry in the table are copied through unchanged.
    """
    escaped = []
    for char in text:
        escaped.append(html_escape_table.get(char, char))
    return "".join(escaped)


def _strip_bibtex_markup(text):
    """Drop the braces and backslashes some bibtex entries use to keep formatting."""
    return text.replace("{", "").replace("}", "").replace("\\", "")


def _two_digit_month(raw_month):
    """Turn a bibtex month ("3", "03", "mar", "March") into a two-digit string.

    Raises ValueError when the value is neither a number from 1 to 12 nor a
    month name whose first three letters strptime's %b can parse.
    """
    month_text = str(raw_month).strip()
    if month_text.isdigit():
        month_number = int(month_text)
        if not 1 <= month_number <= 12:
            raise ValueError("month out of range: " + month_text)
    else:
        month_number = strptime(month_text[:3], '%b').tm_mon
    return "{:02d}".format(month_number)


def _two_digit_day(raw_day):
    """Return the leading day number of raw_day, zero-padded to two digits.

    Raises ValueError when raw_day does not start with a number from 1 to 31.
    """
    found = re.match(r"\s*(\d{1,2})", str(raw_day))
    if found is None or not 1 <= int(found.group(1)) <= 31:
        raise ValueError("unusable day: " + str(raw_day))
    return "{:02d}".format(int(found.group(1)))


# Convert every entry of every configured bibtex file into one markdown page
# (YAML front matter plus a short body) under ../_publications/.
for pubsource in publist:
    parser = bibtex.Parser()
    bibdata = parser.parse_file(publist[pubsource]["file"])

    #loop through the individual references in a given bibtex file
    for bib_id in bibdata.entries:
        #reset default date
        pub_year = "1900"
        pub_month = "01"
        pub_day = "01"

        b = bibdata.entries[bib_id].fields

        try:
            pub_year = f'{b["year"]}'

            # A month or day that cannot be understood keeps the default
            # instead of aborting the whole run with an uncaught ValueError.
            if "month" in b.keys():
                try:
                    pub_month = _two_digit_month(b["month"])
                except ValueError:
                    print(f'WARNING Unrecognised month {b["month"]!r} in entry {bib_id}, using {pub_month}')
            if "day" in b.keys():
                try:
                    pub_day = _two_digit_day(b["day"])
                except ValueError:
                    print(f'WARNING Unrecognised day {b["day"]!r} in entry {bib_id}, using {pub_day}')

            pub_date = pub_year+"-"+pub_month+"-"+pub_day

            #strip out {} as needed (some bibtex entries that maintain formatting)
            title_text = _strip_bibtex_markup(b["title"])
            clean_title = title_text.replace(" ","-")

            url_slug = re.sub("\\[.*\\]|[^a-zA-Z0-9_-]", "", clean_title)

            # Collapse every run of dashes, however long, into a single dash.
            html_filename = re.sub("-{2,}", "-", str(pub_date) + "-" + url_slug)
            md_filename = html_filename + ".md"

            #Build Citation from text
            # The citation is assembled from unescaped text and escaped exactly
            # once when it is written to the front matter below; escaping the
            # pieces here as well would turn "&" into "&amp;amp;".
            citation = ""

            #citation authors - todo - add highlighting for primary author?
            for author in bibdata.entries[bib_id].persons["author"]:
                # Authors may lack a first or last name; use whichever parts exist.
                name_parts = []
                if author.first_names:
                    name_parts.append(author.first_names[0])
                if author.last_names:
                    name_parts.append(author.last_names[0])
                citation = citation+" "+" ".join(name_parts)+", "

            #citation title
            citation = citation + "\"" + title_text + ".\""

            #add venue logic depending on citation type
            venue = publist[pubsource]["venue-pretext"]+_strip_bibtex_markup(b[publist[pubsource]["venuekey"]])

            citation = citation + " " + venue
            citation = citation + ", " + pub_year + "."

            ## YAML variables
            md = "---\ntitle: \"" + html_escape(title_text) + '"\n'

            md += """collection: """ + publist[pubsource]["collection"]["name"]

            md += """\npermalink: """ + publist[pubsource]["collection"]["permalink"] + html_filename

            # Notes and urls of five characters or fewer are treated as absent.
            note = False
            if "note" in b.keys():
                if len(str(b["note"])) > 5:
                    md += "\nexcerpt: '" + html_escape(b["note"]) + "'"
                    note = True

            md += "\ndate: " + str(pub_date)

            md += "\nvenue: '" + html_escape(venue) + "'"

            url = False
            if "url" in b.keys():
                if len(str(b["url"])) > 5:
                    md += "\npaperurl: '" + b["url"] + "'"
                    url = True

            md += "\ncitation: '" + html_escape(citation) + "'"

            md += "\n---"

            ## Markdown description for individual page
            if note:
                md += "\n" + html_escape(b["note"]) + "\n"

            if url:
                md += "\n[Access paper here](" + b["url"] + "){:target=\"_blank\"}\n"
            else:
                md += "\nUse [Google Scholar](https://scholar.google.com/scholar?q="+html.escape(clean_title.replace("-","+"))+"){:target=\"_blank\"} for full citation"

            md_filename = os.path.basename(md_filename)

            # Explicit encoding so non-ASCII titles and names are written the
            # same way regardless of the platform's default encoding.
            with open("../_publications/" + md_filename, 'w', encoding="utf-8") as f:
                f.write(md)
            print(f'SUCESSFULLY PARSED {bib_id}: \"', b["title"][:60],"..."*(len(b['title'])>60),"\"")
        # field may not exist for a reference
        except KeyError as e:
            # The missing field may be the title itself, so it cannot be read
            # unconditionally while reporting the problem.
            shown_title = b["title"] if "title" in b.keys() else ""
            print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', shown_title[:30],"..."*(len(shown_title)>30),"\"")
            continue

0 comments on commit de37ec3

Please sign in to comment.