Merge pull request academicpages#197 from mborowczak/pubsfrombib

Ability to generate publications from bibtex files for academicpages#2
cjd04 · Feb 19, 2019 · de37ec3 · de37ec3
2 parents 1859ec1 + 9458158
commit de37ec3
Show file tree

Hide file tree

Showing 2 changed files with 383 additions and 0 deletions.
diff --git a/markdown_generator/PubsFromBib.ipynb b/markdown_generator/PubsFromBib.ipynb
@@ -0,0 +1,223 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Publications markdown generator for academicpages\n",
+    "\n",
+    "Takes a set of BibTeX files of publications and converts them for use with [academicpages.github.io](https://academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)). \n",
+    "\n",
+    "The core python code is also in `pubsFromBib.py`. \n",
+    "Run either from the `markdown_generator` folder after updating the `publist` dictionary with:\n",
+    "* bib file names\n",
+    "* specific venue keys based on your bib file preferences\n",
+    "* any specific pre-text for specific files\n",
+    "* Collection Name (future feature)\n",
+    "\n",
+    "TODO: Make this work with other databases of citations, \n",
+    "TODO: Merge this with the existing TSV parsing solution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pybtex.database.input import bibtex\n",
+    "import pybtex.database.input.bibtex \n",
+    "from time import strptime\n",
+    "import string\n",
+    "import html\n",
+    "import os\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#todo: incorporate different collection types rather than a catch all publications, requires other changes to template\n",
+    "publist = {\n",
+    "    \"proceeding\": {\n",
+    "        \"file\" : \"proceedings.bib\",\n",
+    "        \"venuekey\": \"booktitle\",\n",
+    "        \"venue-pretext\": \"In the proceedings of \",\n",
+    "        \"collection\" : {\"name\":\"publications\",\n",
+    "                        \"permalink\":\"/publication/\"}\n",
+    "        \n",
+    "    },\n",
+    "    \"journal\":{\n",
+    "        \"file\": \"pubs.bib\",\n",
+    "        \"venuekey\" : \"journal\",\n",
+    "        \"venue-pretext\" : \"\",\n",
+    "        \"collection\" : {\"name\":\"publications\",\n",
+    "                        \"permalink\":\"/publication/\"}\n",
+    "    } \n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "html_escape_table = {\n",
+    "    \"&\": \"&amp;\",\n",
+    "    '\"': \"&quot;\",\n",
+    "    \"'\": \"&apos;\"\n",
+    "    }\n",
+    "\n",
+    "def html_escape(text):\n",
+    "    \"\"\"Produce entities within text.\"\"\"\n",
+    "    return \"\".join(html_escape_table.get(c,c) for c in text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "for pubsource in publist:\n",
+    "    parser = bibtex.Parser()\n",
+    "    bibdata = parser.parse_file(publist[pubsource][\"file\"])\n",
+    "\n",
+    "    #loop through the individual references in a given bibtex file\n",
+    "    for bib_id in bibdata.entries:\n",
+    "        #reset default date\n",
+    "        pub_year = \"1900\"\n",
+    "        pub_month = \"01\"\n",
+    "        pub_day = \"01\"\n",
+    "        \n",
+    "        b = bibdata.entries[bib_id].fields\n",
+    "        \n",
+    "        try:\n",
+    "            pub_year = f'{b[\"year\"]}'\n",
+    "\n",
+    "            #todo: this hack for month and day needs some cleanup\n",
+    "            if \"month\" in b.keys(): \n",
+    "                if(len(b[\"month\"])<3):\n",
+    "                    pub_month = \"0\"+b[\"month\"]\n",
+    "                    pub_month = pub_month[-2:]\n",
+    "                elif(b[\"month\"] not in range(12)):\n",
+    "                    tmnth = strptime(b[\"month\"][:3],'%b').tm_mon   \n",
+    "                    pub_month = \"{:02d}\".format(tmnth) \n",
+    "                else:\n",
+    "                    pub_month = str(b[\"month\"])\n",
+    "            if \"day\" in b.keys(): \n",
+    "                pub_day = str(b[\"day\"])\n",
+    "\n",
+    "                \n",
+    "            pub_date = pub_year+\"-\"+pub_month+\"-\"+pub_day\n",
+    "            \n",
+    "            #strip out {} as needed (some bibtex entries that maintain formatting)\n",
+    "            clean_title = b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\").replace(\" \",\"-\")    \n",
+    "\n",
+    "            url_slug = re.sub(\"\\\\[.*\\\\]|[^a-zA-Z0-9_-]\", \"\", clean_title)\n",
+    "            url_slug = url_slug.replace(\"--\",\"-\")\n",
+    "\n",
+    "            md_filename = (str(pub_date) + \"-\" + url_slug + \".md\").replace(\"--\",\"-\")\n",
+    "            html_filename = (str(pub_date) + \"-\" + url_slug).replace(\"--\",\"-\")\n",
+    "\n",
+    "            #Build Citation from text\n",
+    "            citation = \"\"\n",
+    "\n",
+    "            #citation authors - todo - add highlighting for primary author?\n",
+    "            for author in bibdata.entries[bib_id].persons[\"author\"]:\n",
+    "                citation = citation+\" \"+author.first_names[0]+\" \"+author.last_names[0]+\", \"\n",
+    "\n",
+    "            #citation title\n",
+    "            citation = citation + \"\\\"\" + html_escape(b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")) + \".\\\"\"\n",
+    "\n",
+    "            #add venue logic depending on citation type\n",
+    "            venue = publist[pubsource][\"venue-pretext\"]+b[publist[pubsource][\"venuekey\"]].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")\n",
+    "\n",
+    "            citation = citation + \" \" + html_escape(venue)\n",
+    "            citation = citation + \", \" + pub_year + \".\"\n",
+    "\n",
+    "            \n",
+    "            ## YAML variables\n",
+    "            md = \"---\\ntitle: \\\"\"   + html_escape(b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")) + '\"\\n'\n",
+    "            \n",
+    "            md += \"\"\"collection: \"\"\" +  publist[pubsource][\"collection\"][\"name\"]\n",
+    "\n",
+    "            md += \"\"\"\\npermalink: \"\"\" + publist[pubsource][\"collection\"][\"permalink\"]  + html_filename\n",
+    "            \n",
+    "            note = False\n",
+    "            if \"note\" in b.keys():\n",
+    "                if len(str(b[\"note\"])) > 5:\n",
+    "                    md += \"\\nexcerpt: '\" + html_escape(b[\"note\"]) + \"'\"\n",
+    "                    note = True\n",
+    "\n",
+    "            md += \"\\ndate: \" + str(pub_date) \n",
+    "\n",
+    "            md += \"\\nvenue: '\" + html_escape(venue) + \"'\"\n",
+    "            \n",
+    "            url = False\n",
+    "            if \"url\" in b.keys():\n",
+    "                if len(str(b[\"url\"])) > 5:\n",
+    "                    md += \"\\npaperurl: '\" + b[\"url\"] + \"'\"\n",
+    "                    url = True\n",
+    "\n",
+    "            md += \"\\ncitation: '\" + html_escape(citation) + \"'\"\n",
+    "\n",
+    "            md += \"\\n---\"\n",
+    "\n",
+    "            \n",
+    "            ## Markdown description for individual page\n",
+    "            if note:\n",
+    "                md += \"\\n\" + html_escape(b[\"note\"]) + \"\\n\"\n",
+    "\n",
+    "            if url:\n",
+    "                md += \"\\n[Access paper here](\" + b[\"url\"] + \"){:target=\\\"_blank\\\"}\\n\" \n",
+    "            else:\n",
+    "                md += \"\\nUse [Google Scholar](https://scholar.google.com/scholar?q=\"+html.escape(clean_title.replace(\"-\",\"+\"))+\"){:target=\\\"_blank\\\"} for full citation\"\n",
+    "\n",
+    "            md_filename = os.path.basename(md_filename)\n",
+    "\n",
+    "            with open(\"../_publications/\" + md_filename, 'w') as f:\n",
+    "                f.write(md)\n",
+    "            print(f'SUCESSFULLY PARSED {bib_id}: \\\"', b[\"title\"][:60],\"...\"*(len(b['title'])>60),\"\\\"\")\n",
+    "        # field may not exist for a reference\n",
+    "        except KeyError as e:\n",
+    "            print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \\\"', b[\"title\"][:30],\"...\"*(len(b['title'])>30),\"\\\"\")\n",
+    "            continue\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/markdown_generator/pubsFromBib.py b/markdown_generator/pubsFromBib.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# # Publications markdown generator for academicpages
+# 
+# Takes a set of BibTeX files of publications and converts them for use with [academicpages.github.io](https://academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)). 
+# 
+# The core python code is also in `pubsFromBib.py`. 
+# Run either from the `markdown_generator` folder after updating the `publist` dictionary with:
+# * bib file names
+# * specific venue keys based on your bib file preferences
+# * any specific pre-text for specific files
+# * Collection Name (future feature)
+# 
+# TODO: Make this work with other databases of citations, 
+# TODO: Merge this with the existing TSV parsing solution
+
+
+from pybtex.database.input import bibtex
+import pybtex.database.input.bibtex 
+from time import strptime
+import string
+import html
+import os
+import re
+
+#todo: incorporate different collection types rather than a catch all publications, requires other changes to template
+# One entry per bibtex source file. For each source:
+#   "file"          - bib file to parse (path is used as given)
+#   "venuekey"      - bibtex field that holds the venue name
+#   "venue-pretext" - text placed in front of the venue name
+#   "collection"    - collection name and permalink prefix written to the front matter
+#todo: incorporate different collection types rather than a catch all publications, requires other changes to template
+publist = {
+    "proceeding": {
+        "file": "proceedings.bib",
+        "venuekey": "booktitle",
+        "venue-pretext": "In the proceedings of ",
+        "collection": {
+            "name": "publications",
+            "permalink": "/publication/",
+        },
+    },
+    "journal": {
+        "file": "pubs.bib",
+        "venuekey": "journal",
+        "venue-pretext": "",
+        "collection": {
+            "name": "publications",
+            "permalink": "/publication/",
+        },
+    },
+}
+
+# Characters that are replaced by an HTML entity before text is written out.
+html_escape_table = {
+    "&": "&amp;",
+    '"': "&quot;",
+    "'": "&apos;",
+}
+
+def html_escape(text):
+    """Return text with every character listed in html_escape_table replaced by its entity.
+
+    Characters that are not keys of html_escape_table are passed through unchanged.
+    """
+    pieces = []
+    for ch in text:
+        pieces.append(html_escape_table.get(ch, ch))
+    return "".join(pieces)
+
+
+# Main generation loop: for every source in publist, parse its bibtex file and
+# write one markdown file per entry to ../_publications/<date>-<slug>.md.
+# Entries missing a required field (KeyError) are reported and skipped.
+for pubsource in publist:
+    # Settings for this source: bib file name, venue field, venue prefix, target collection.
+    source_cfg = publist[pubsource]
+
+    parser = bibtex.Parser()
+    bibdata = parser.parse_file(source_cfg["file"])
+
+    #loop through the individual references in a given bibtex file
+    for bib_id in bibdata.entries:
+        #reset default date
+        pub_year = "1900"
+        pub_month = "01"
+        pub_day = "01"
+
+        b = bibdata.entries[bib_id].fields
+
+        try:
+            pub_year = f'{b["year"]}'
+
+            # Month may be numeric ("3", "03") or a name ("mar", "March").
+            if "month" in b.keys():
+                month_text = str(b["month"]).strip()
+                if month_text.isdigit():
+                    pub_month = month_text.zfill(2)[-2:]
+                else:
+                    try:
+                        pub_month = "{:02d}".format(strptime(month_text[:3], '%b').tm_mon)
+                    except ValueError:
+                        # An unparseable month must not abort the whole run; keep the default.
+                        print(f'WARNING Unrecognized month "{month_text}" in entry {bib_id}; using {pub_month}')
+            if "day" in b.keys():
+                day_text = str(b["day"]).strip()
+                if day_text:
+                    # Zero-pad so the date reads YYYY-MM-DD in the filename and front matter.
+                    pub_day = day_text.zfill(2)
+
+            pub_date = pub_year+"-"+pub_month+"-"+pub_day
+
+            #strip out {} and backslashes as needed (some bibtex entries that maintain formatting)
+            plain_title = b["title"].replace("{", "").replace("}","").replace("\\","")
+            clean_title = plain_title.replace(" ","-")
+
+            url_slug = re.sub("\\[.*\\]|[^a-zA-Z0-9_-]", "", clean_title)
+            url_slug = url_slug.replace("--","-")
+
+            md_filename = (str(pub_date) + "-" + url_slug + ".md").replace("--","-")
+            html_filename = (str(pub_date) + "-" + url_slug).replace("--","-")
+
+            #Build Citation from text
+            # The citation is assembled from raw text and HTML-escaped exactly once
+            # when it is written to the front matter, so "&" does not become "&amp;amp;".
+            citation = ""
+
+            #citation authors - todo - add highlighting for primary author?
+            for author in bibdata.entries[bib_id].persons["author"]:
+                # An author may lack a first or last name (e.g. an organisation); use what exists.
+                name_parts = [names[0] for names in (author.first_names, author.last_names) if names]
+                if name_parts:
+                    citation = citation+" "+" ".join(name_parts)+", "
+
+            #citation title
+            citation = citation + "\"" + plain_title + ".\""
+
+            #add venue logic depending on citation type
+            venue = source_cfg["venue-pretext"]+b[source_cfg["venuekey"]].replace("{", "").replace("}","").replace("\\","")
+
+            citation = citation + " " + venue
+            citation = citation + ", " + pub_year + "."
+
+
+            ## YAML variables
+            md = "---\ntitle: \""   + html_escape(plain_title) + '"\n'
+
+            md += """collection: """ +  source_cfg["collection"]["name"]
+
+            md += """\npermalink: """ + source_cfg["collection"]["permalink"]  + html_filename
+
+            # Notes and URLs of 5 characters or fewer are treated as absent.
+            note = False
+            if "note" in b.keys():
+                if len(str(b["note"])) > 5:
+                    md += "\nexcerpt: '" + html_escape(b["note"]) + "'"
+                    note = True
+
+            md += "\ndate: " + str(pub_date) 
+
+            md += "\nvenue: '" + html_escape(venue) + "'"
+
+            url = False
+            if "url" in b.keys():
+                if len(str(b["url"])) > 5:
+                    md += "\npaperurl: '" + b["url"] + "'"
+                    url = True
+
+            md += "\ncitation: '" + html_escape(citation) + "'"
+
+            md += "\n---"
+
+
+            ## Markdown description for individual page
+            if note:
+                md += "\n" + html_escape(b["note"]) + "\n"
+
+            if url:
+                md += "\n[Access paper here](" + b["url"] + "){:target=\"_blank\"}\n" 
+            else:
+                # NOTE(review): the query is HTML-escaped, not URL-quoted; titles containing
+                # characters such as "#" or "&" may yield an incomplete search link.
+                md += "\nUse [Google Scholar](https://scholar.google.com/scholar?q="+html.escape(clean_title.replace("-","+"))+"){:target=\"_blank\"} for full citation"
+
+            md_filename = os.path.basename(md_filename)
+
+            # Write as UTF-8 explicitly so non-ASCII titles do not depend on the platform default.
+            with open("../_publications/" + md_filename, 'w', encoding="utf-8") as f:
+                f.write(md)
+            print(f'SUCESSFULLY PARSED {bib_id}: \"', b["title"][:60],"..."*(len(b['title'])>60),"\"")
+        # field may not exist for a reference
+        except KeyError as e:
+            # The missing field may be the title itself, so never index it blindly here.
+            shown_title = b["title"] if "title" in b.keys() else ""
+            print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', shown_title[:30],"..."*(len(shown_title)>30),"\"")
+            continue