
Commit

Merge pull request #33 from Plikt/main
Making orcid2 work
Plikt authored Apr 29, 2024
2 parents 489b7b1 + 782ab53 commit 43f4cf1
Showing 8 changed files with 1,022 additions and 127 deletions.
Binary file modified .DS_Store
6 changes: 5 additions & 1 deletion DOCKERFILE
@@ -10,5 +10,9 @@ COPY app/ /usr/src/app/
# Install dependencies
RUN pip install -r requirements.txt

#run service - Expose (what is the request response model)
EXPOSE 5001

# Define the command to run when the container starts
CMD ["python", "langchain_api.py"]
CMD ["flask", "run", "--host=0.0.0.0"]
#CMD ["python", "langchain_orcid.py"]
Binary file modified app/__pycache__/langchain_api.cpython-310.pyc
231 changes: 105 additions & 126 deletions app/langchain_api.py
@@ -36,9 +36,6 @@
import tiktoken
#from demo import read_single

#TODO: IF doi -> then search open alex -> determine relevant metadata to return. -> Together once everything is up to date.
#TODO: get api + langchain + structured output in a pretty package -> Ellie

#from ..Server.PDFDataExtractor.pdfdataextractor.demo import read_single
sys.path.append(os.path.abspath("/Users/desot1/Dev/automating-metadata/Server/PDFDataExtractor/pdfdataextractor"))
pyalex.config.email = "ellie@desci.com"
@@ -143,10 +140,6 @@ def paper_data_json_single(doi):
cr.mailto = 'desotaelianna@gmail.com'
cr.ua_string = 'Python/Flask script for use in Desci Nodes publication information retrieval.'

# Elsevier API key
apikey = os.getenv("apikey")
client = httpx.Client()


#%% Info from Crossref
try:
@@ -158,7 +151,10 @@
title = r['message']['title'][0]
except:
title = f"None, Crossref Error"

try:
abstract = r['message']['abstract'][0]
except:
abstract = f"None, Crossref Error"
try:
type = r['message']['type']
except:
@@ -178,26 +174,31 @@
subject = r['message']['subject']
except:
subject = "None, Crossref Error"
try:
license = r['message']['license']
except:
license = "None, Crossref Error"

authors_info = {}
inst_names = {} # handling multiple colleges, universities

for i in r['message']['author']:
author_name = i['given'] + ' ' + i['family']
author_info = {'affiliation': None, 'orcid': None}

try:
institution = i['affiliation'][0]['name']
#if institution not in inst_names:
#inst_names.append(institution)
except:
institution = None
author_info['affiliation'] = i['affiliation'][0]['name']
except (KeyError, IndexError):
pass

if institution:
authors_info[author_name] = institution
else:
authors_info[author_name] = "None"
try:
author_info['orcid'] = i['ORCID']
except KeyError:
pass

authors_info[author_name] = author_info

if not authors_info:
authors_info["None"] = "None, no institutions returned by CrossRef"
authors_info["None"] = {'affiliation': 'None, no authors returned by CrossRef', 'orcid': 'None'}


refs = []
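For clarity, the per-author mapping assembled in this hunk ends up shaped roughly like the sketch below; the names and values are invented purely to illustrate the structure, and only the keys mirror the code above.

# Illustrative example of the authors_info mapping built from the Crossref record
# (names and values invented; Crossref returns ORCID iDs as full URLs when present).
authors_info = {
    "Jane Doe": {
        "affiliation": "Example University",               # from i['affiliation'][0]['name']
        "orcid": "http://orcid.org/0000-0002-1825-0097",   # from i['ORCID'], when present
    },
    "John Smith": {
        "affiliation": None,   # no affiliation in the Crossref record
        "orcid": None,
    },
}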
@@ -209,39 +210,6 @@

url_link = r['message']['URL']


#%% Info from Elsevier
format = 'application/json'
view ="FULL"
url = f"https://api.elsevier.com/content/article/doi/{doi}?APIKey={apikey}&httpAccept={format}&view={view}"
with httpx.Client() as client:
r=client.get(url)

json_string = r.text
d = json.loads(json_string) # "d" for dictionary

try:
scopus_id = d['full-text-retrieval-response']['scopus-id']
except:
scopus_id = 'None, elsevier error'

try:
abstract = d['full-text-retrieval-response']['coredata']['dc:description']
except:
abstract = 'None, elsevier error'

try:
keywords = []
for i in d['full-text-retrieval-response']['coredata']['dcterms:subject']:
keywords.append(i['$'])

except:
keywords = ['None, elsevier error']

try:
original_text = d['full-text-retrieval-response']['originalText']
except:
original_text = 'None, elsevier error'


#%% Info from Semantic Scholar
@@ -284,66 +252,62 @@ def paper_data_json_single(doi):
openaccess_pdf = "None, Semantic Scholar lookup error"

# OpenAlex accessing as backup info for the previous tools
openalex=True
try:
openalex_results = Works()[doi] # Crossref search using DOI, "r" for request
except requests.exceptions.HTTPError as e:
print(f"OpenAlex DOI lookup returned error: {e}\n")
openalex = False

if openalex:

if "error" in title: # attempt replacing error title from cross with title from openalex
try:
title = openalex_results['title']
except:
pass
if "error" in type: # attempt replacing error keywords from cross with title from openalex
try:
type = openalex_results['type']
except:
pass
if "error" in pub_name: # attempt replacing error keywords from cross with title from openalex
try:
pub_name = openalex_results['primary_location']
except:
pass

if "error" in pub_date: # attempt replacing error keywords from cross with title from openalex
try:
pub_date = openalex_results['publication_date']
except:
pass

try:
openalex_id = openalex_results['id']
except:
openalex_id = "None, OpenAlex Lookup error"

if "error" in title: # attempt replacing error title from cross with title from openalex
try:
title = openalex_results['title']
except:
pass
if "error" in type: # attempt replacing error keywords from cross with title from openalex
try:
type = openalex_results['type']
except:
pass
if "error" in pub_name: # attempt replacing error keywords from cross with title from openalex
try:
pub_name = openalex_results['primary_location']
except:
pass

if "error" in pub_date: # attempt replacing error keywords from cross with title from openalex
try:
pub_date = openalex_results['publication_date']
except:
pass

if "error" in keywords: # attempt replacing error keywords from cross with title from openalex
try:
title = openalex_results['title']
except:
pass

try:
openaccess = openalex_results['open_access']
keywords = openalex_results['keywords']
except:
pass
keywords = "none"


#%% Constructing output dictionary
output_dict = {
# Paper Metadata
'title':title,
'authors':authors_info,
'abstract':abstract,
'scopus_id':scopus_id,
'paperId':paper_id,
'publisher':pub_name,
'datePublished':pub_date,
'type':type,
'keywords':keywords,
'about':subject,
'fields_of_study':field_of_study,
'institution_names':inst_names,
'references':refs,
'tldr':tldr,
'original_text':original_text,
'openAccessPdf':openaccess_pdf,
'URL_link': url_link,
'openalex_id': openalex_id,
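The OpenAlex block earlier in this hunk repeats one pattern per field: if the Crossref value still holds an error string, try the corresponding OpenAlex field instead. A compact sketch of that idea is below; the helper name is an assumption, not code from this commit. Note also that literal checks such as "error" in title are case-sensitive, so they will not match the "None, Crossref Error" strings set earlier unless the case is normalized, as done here.

# Hypothetical helper condensing the fallback pattern used above (not in this diff).
def fill_if_error(current, openalex_results, key):
    # Return the OpenAlex value for `key` when `current` still carries a lookup error.
    if isinstance(current, str) and "error" in current.lower():
        try:
            return openalex_results[key]
        except (KeyError, TypeError):
            return current
    return current

# e.g. title = fill_if_error(title, openalex_results, 'title')
#      pub_date = fill_if_error(pub_date, openalex_results, 'publication_date')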
@@ -438,53 +402,35 @@ def get_orcid(authors):

for author in authors:
try:
url = "https://api.openalex.org/autocomplete/authors?q=" + author
url = "https://api.openalex.org/authors?search=" + author
response = json.loads(requests.get(url).text)
except Exception as e: # Added variable 'e' to catch the exception
print(f"OpenAlex ORCID lookup returned error: {e}\n")
continue # Skip to the next author

if response["meta"]["count"] == 1:
orcid = response["results"][0]["external_id"]
affiliation = response["results"][0]["hint"]
elif response["meta"]["count"] == 0:
#print(response)
if response["meta"]["count"] >= 1:
orcid = response["results"][0]["orcid"]
print(orcid)
affiliation = response["results"][0]["affiliations"][0]["institution"]["display_name"]
display_name = response["results"][0]["display_name"] # Updated to use display_name

author_info = {
"@id": f"https://orcid.org/{orcid}",
"role": "Person",
"affiliation": affiliation,
"name": display_name
}

orcid_info[author] = author_info

else:
print("None, There are no OrcID suggestions for this author")
orcid_info[author] = "none"
continue # Skip to the next author
else:
orcid = response["results"][0]["external_id"]
affiliation = response["results"][0]["hint"]

author_info = {
"orcid": orcid,
"affiliation": affiliation
}

orcid_info[author] = author_info

return orcid_info
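For reference, the rewritten get_orcid reads a handful of fields from the OpenAlex authors search response; a trimmed sketch of that shape is below, with invented values. Note that OpenAlex already returns the orcid field as a full https://orcid.org/... URL, so prefixing it again when building the @id will double the scheme.

# Illustrative shape of the OpenAlex response fields read by get_orcid above
# (values invented; only the keys mirror what the code accesses).
example_response = {
    "meta": {"count": 1},
    "results": [
        {
            "display_name": "Jane Doe",
            "orcid": "https://orcid.org/0000-0002-1825-0097",
            "affiliations": [
                {"institution": {"display_name": "Example University"}}
            ],
        }
    ],
}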

#def get_orcid(authors):
orcid = []
author_info = {}

for author in authors:
try:
url = "https://api.openalex.org/autocomplete/authors?q=" + author
response = json.loads(requests.get(url).text)
except:
print(f"OpenAlex ORCID lookup returned error: {e}\n")

if response["meta"]["count"] == 1:
orcid = response["results"][0]["external_id"]
author_info[author] = {"orcid": orcid, "affiliation":response["results"][0]["hint"]}
elif response["meta"]["count"] == 0: #FAKE - Create a test so we can check if the return is valid.
print("None, There are no OrcID suggestions for this author")
else:
orcid = response["results"][0]["external_id"]
author_info[author] = {"orcid": orcid, "affiliation": response["results"][0]["hint"]}
#create an async function which ranks the authors based on the similarity to the paper.

return author_info

def check_item_filled(json_ld, name):
for item in json_ld["@graph"]:
@@ -511,7 +457,7 @@ def update_json_ld(json_ld, new_data):
if isinstance(author_info, dict):
orchid = author_info.get("orcid")
organization = author_info.get("affiliation")

if orchid:
creator_entry["@id"] = orchid
if organization:
@@ -529,11 +475,11 @@ def update_json_ld(json_ld, new_data):


#%% Main, general case for testing
if __name__ == "__main__":
"""if __name__ == "__main__":
print("Starting code run...")
node = "46" #os.getenv('NODE_ENV')
DOI_env = "10.3847/0004-637X/828/1/46" #os.getenv('DOI_ENV')
DOI_env = "10.3847/0004-637X/828/1/46"#os.getenv('DOI_ENV') #
if node is not None:
print(f"NODE_ENVIRONMENT is set to: {node}")
@@ -557,3 +503,36 @@ def update_json_ld(json_ld, new_data):
print(updated_json_ld)
print("Script completed")
"""
def run(node, doi=None):
print("Starting code run...")

#node = "46" #os.getenv('NODE_ENV')
#DOI_env = "10.3847/0004-637X/828/1/46"#os.getenv('DOI_ENV') #

if node is not None:
print(f"NODE_ENVIRONMENT is set to: {node}")
else:
print("NODE_ENVIRONMENT is not set.")

json_ld = get_jsonld(node)
print(json_ld)

if doi:
lookup_results = paper_data_json_single(doi)
#updated_json_ld = update_json_ld(json_ld, lookup_results)

else:
updated_json_ld = json_ld

llm_output = asyncio.run(langchain_paper_search(node))# output of unstructured text in dictionary
#updated_json_ld = update_json_ld(json_ld, llm_output)
updated_json_ld = json_ld
#doi = "https://doi.org/10.1002/adma.202208113"

#print(updated_json_ld)

print("Script completed")

if __name__ == "__main__":
run("46", "https://doi.org/10.1002/adma.202208113")
