
Commit

Merge pull request #33 from Plikt/main
Making orcid2 work
Plikt authored Apr 29, 2024
2 parents 489b7b1 + 782ab53 commit 43f4cf1
Showing 8 changed files with 1,022 additions and 127 deletions.
Binary file modified .DS_Store
6 changes: 5 additions & 1 deletion DOCKERFILE
@@ -10,5 +10,9 @@ COPY app/ /usr/src/app/
# Install dependencies
RUN pip install -r requirements.txt

#run service - Expose (what is the request response model)
EXPOSE 5001

# Define the command to run when the container starts
CMD ["python", "langchain_api.py"]
CMD ["flask", "run", "--host=0.0.0.0"]
#CMD ["python", "langchain_orcid.py"]
Binary file modified app/__pycache__/langchain_api.cpython-310.pyc
231 changes: 105 additions & 126 deletions app/langchain_api.py
@@ -36,9 +36,6 @@
import tiktoken
#from demo import read_single

#TODO: IF doi -> then search open alex -> determine relevant metadata to return. -> Together once everything is up to date.
#TODO: get api + langchain + structured output in a pretty package -> Ellie

#from ..Server.PDFDataExtractor.pdfdataextractor.demo import read_single
sys.path.append(os.path.abspath("/Users/desot1/Dev/automating-metadata/Server/PDFDataExtractor/pdfdataextractor"))
pyalex.config.email = "ellie@desci.com"
@@ -143,10 +140,6 @@ def paper_data_json_single(doi):
cr.mailto = 'desotaelianna@gmail.com'
cr.ua_string = 'Python/Flask script for use in Desci Nodes publication information retrieval.'

# Elsevier API key
apikey = os.getenv("apikey")
client = httpx.Client()


#%% Info from Crossref
try:
@@ -158,7 +151,10 @@
title = r['message']['title'][0]
except:
title = f"None, Crossref Error"

try:
abstract = r['message']['abstract'][0]
except:
abstract = f"None, Crossref Error"
try:
type = r['message']['type']
except:
@@ -178,26 +174,31 @@
subject = r['message']['subject']
except:
subject = "None, Crossref Error"
try:
license = r['message']['license']
except:
license = "None, Crossref Error"

authors_info = {}
inst_names = {} # handling multiple colleges, universities

for i in r['message']['author']:
author_name = i['given'] + ' ' + i['family']
author_info = {'affiliation': None, 'orcid': None}

try:
institution = i['affiliation'][0]['name']
#if institution not in inst_names:
#inst_names.append(institution)
except:
institution = None
author_info['affiliation'] = i['affiliation'][0]['name']
except (KeyError, IndexError):
pass

if institution:
authors_info[author_name] = institution
else:
authors_info[author_name] = "None"
try:
author_info['orcid'] = i['ORCID']
except KeyError:
pass

authors_info[author_name] = author_info

if not authors_info:
authors_info["None"] = "None, no institutions returned by CrossRef"
authors_info["None"] = {'affiliation': 'None, no authors returned by CrossRef', 'orcid': 'None'}


refs = []
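For clarity, the per-author mapping assembled in this hunk ends up shaped roughly like the sketch below; the names and values are invented purely to illustrate the structure, and only the keys mirror the code above.

# Illustrative example of the authors_info mapping built from the Crossref record
# (names and values invented; Crossref returns ORCID iDs as full URLs when present).
authors_info = {
    "Jane Doe": {
        "affiliation": "Example University",               # from i['affiliation'][0]['name']
        "orcid": "http://orcid.org/0000-0002-1825-0097",   # from i['ORCID'], when present
    },
    "John Smith": {
        "affiliation": None,   # no affiliation in the Crossref record
        "orcid": None,
    },
}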
@@ -209,39 +210,6 @@

url_link = r['message']['URL']


#%% Info from Elsevier
format = 'application/json'
view ="FULL"
url = f"https://api.elsevier.com/content/article/doi/{doi}?APIKey={apikey}&httpAccept={format}&view={view}"
with httpx.Client() as client:
r=client.get(url)

json_string = r.text
d = json.loads(json_string) # "d" for dictionary

try:
scopus_id = d['full-text-retrieval-response']['scopus-id']
except:
scopus_id = 'None, elsevier error'

try:
abstract = d['full-text-retrieval-response']['coredata']['dc:description']
except:
abstract = 'None, elsevier error'

try:
keywords = []
for i in d['full-text-retrieval-response']['coredata']['dcterms:subject']:
keywords.append(i['$'])

except:
keywords = ['None, elsevier error']

try:
original_text = d['full-text-retrieval-response']['originalText']
except:
original_text = 'None, elsevier error'


#%% Info from Semantic Scholar
@@ -284,66 +252,62 @@ def paper_data_json_single(doi):
openaccess_pdf = "None, Semantic Scholar lookup error"

# OpenAlex accessing as backup info for the previous tools
openalex=True
try:
openalex_results = Works()[doi] # Crossref search using DOI, "r" for request
except requests.exceptions.HTTPError as e:
print(f"OpenAlex DOI lookup returned error: {e}\n")
openalex = False

if openalex:

if "error" in title: # attempt replacing error title from cross with title from openalex
try:
title = openalex_results['title']
except:
pass
if "error" in type: # attempt replacing error keywords from cross with title from openalex
try:
type = openalex_results['type']
except:
pass
if "error" in pub_name: # attempt replacing error keywords from cross with title from openalex
try:
pub_name = openalex_results['primary_location']
except:
pass

if "error" in pub_date: # attempt replacing error keywords from cross with title from openalex
try:
pub_date = openalex_results['publication_date']
except:
pass

try:
openalex_id = openalex_results['id']
except:
openalex_id = "None, OpenAlex Lookup error"

if "error" in title: # attempt replacing error title from cross with title from openalex
try:
title = openalex_results['title']
except:
pass
if "error" in type: # attempt replacing error keywords from cross with title from openalex
try:
type = openalex_results['type']
except:
pass
if "error" in pub_name: # attempt replacing error keywords from cross with title from openalex
try:
pub_name = openalex_results['primary_location']
except:
pass

if "error" in pub_date: # attempt replacing error keywords from cross with title from openalex
try:
pub_date = openalex_results['publication_date']
except:
pass

if "error" in keywords: # attempt replacing error keywords from cross with title from openalex
try:
title = openalex_results['title']
except:
pass

try:
openaccess = openalex_results['open_access']
keywords = openalex_results['keywords']
except:
pass
keywords = "none"


#%% Constructing output dictionary
output_dict = {
# Paper Metadata
'title':title,
'authors':authors_info,
'abstract':abstract,
'scopus_id':scopus_id,
'paperId':paper_id,
'publisher':pub_name,
'datePublished':pub_date,
'type':type,
'keywords':keywords,
'about':subject,
'fields_of_study':field_of_study,
'institution_names':inst_names,
'references':refs,
'tldr':tldr,
'original_text':original_text,
'openAccessPdf':openaccess_pdf,
'URL_link': url_link,
'openalex_id': openalex_id,
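The OpenAlex block earlier in this hunk repeats one pattern per field: if the Crossref value still holds an error string, try the corresponding OpenAlex field instead. A compact sketch of that idea is below; the helper name is an assumption, not code from this commit. Note also that literal checks such as "error" in title are case-sensitive, so they will not match the "None, Crossref Error" strings set earlier unless the case is normalized, as done here.

# Hypothetical helper condensing the fallback pattern used above (not in this diff).
def fill_if_error(current, openalex_results, key):
    # Return the OpenAlex value for `key` when `current` still carries a lookup error.
    if isinstance(current, str) and "error" in current.lower():
        try:
            return openalex_results[key]
        except (KeyError, TypeError):
            return current
    return current

# e.g. title = fill_if_error(title, openalex_results, 'title')
#      pub_date = fill_if_error(pub_date, openalex_results, 'publication_date')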
@@ -438,53 +402,35 @@ def get_orcid(authors):

for author in authors:
try:
url = "https://api.openalex.org/autocomplete/authors?q=" + author
url = "https://api.openalex.org/authors?search=" + author
response = json.loads(requests.get(url).text)
except Exception as e: # Added variable 'e' to catch the exception
print(f"OpenAlex ORCID lookup returned error: {e}\n")
continue # Skip to the next author

if response["meta"]["count"] == 1:
orcid = response["results"][0]["external_id"]
affiliation = response["results"][0]["hint"]
elif response["meta"]["count"] == 0:
#print(response)
if response["meta"]["count"] >= 1:
orcid = response["results"][0]["orcid"]
print(orcid)
affiliation = response["results"][0]["affiliations"][0]["institution"]["display_name"]
display_name = response["results"][0]["display_name"] # Updated to use display_name

author_info = {
"@id": f"https://orcid.org/{orcid}",
"role": "Person",
"affiliation": affiliation,
"name": display_name
}

orcid_info[author] = author_info

else:
print("None, There are no OrcID suggestions for this author")
orcid_info[author] = "none"
continue # Skip to the next author
else:
orcid = response["results"][0]["external_id"]
affiliation = response["results"][0]["hint"]

author_info = {
"orcid": orcid,
"affiliation": affiliation
}

orcid_info[author] = author_info

return orcid_info
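For reference, the rewritten get_orcid reads a handful of fields from the OpenAlex authors search response; a trimmed sketch of that shape is below, with invented values. Note that OpenAlex already returns the orcid field as a full https://orcid.org/... URL, so prefixing it again when building the @id will double the scheme.

# Illustrative shape of the OpenAlex response fields read by get_orcid above
# (values invented; only the keys mirror what the code accesses).
example_response = {
    "meta": {"count": 1},
    "results": [
        {
            "display_name": "Jane Doe",
            "orcid": "https://orcid.org/0000-0002-1825-0097",
            "affiliations": [
                {"institution": {"display_name": "Example University"}}
            ],
        }
    ],
}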

#def get_orcid(authors):
orcid = []
author_info = {}

for author in authors:
try:
url = "https://api.openalex.org/autocomplete/authors?q=" + author
response = json.loads(requests.get(url).text)
except:
print(f"OpenAlex ORCID lookup returned error: {e}\n")

if response["meta"]["count"] == 1:
orcid = response["results"][0]["external_id"]
author_info[author] = {"orcid": orcid, "affiliation":response["results"][0]["hint"]}
elif response["meta"]["count"] == 0: #FAKE - Create a test so we can check if the return is valid.
print("None, There are no OrcID suggestions for this author")
else:
orcid = response["results"][0]["external_id"]
author_info[author] = {"orcid": orcid, "affiliation": response["results"][0]["hint"]}
#create an async function which ranks the authors based on the similarity to the paper.

return author_info

def check_item_filled(json_ld, name):
for item in json_ld["@graph"]:
@@ -511,7 +457,7 @@ def update_json_ld(json_ld, new_data):
if isinstance(author_info, dict):
orchid = author_info.get("orcid")
organization = author_info.get("affiliation")

if orchid:
creator_entry["@id"] = orchid
if organization:
@@ -529,11 +475,11 @@ def update_json_ld(json_ld, new_data):


#%% Main, general case for testing
if __name__ == "__main__":
"""if __name__ == "__main__":
print("Starting code run...")
node = "46" #os.getenv('NODE_ENV')
DOI_env = "10.3847/0004-637X/828/1/46" #os.getenv('DOI_ENV')
DOI_env = "10.3847/0004-637X/828/1/46"#os.getenv('DOI_ENV') #
if node is not None:
print(f"NODE_ENVIRONMENT is set to: {node}")
@@ -557,3 +503,36 @@ def update_json_ld(json_ld, new_data):
print(updated_json_ld)
print("Script completed")
"""
def run(node, doi=None):
print("Starting code run...")

#node = "46" #os.getenv('NODE_ENV')
#DOI_env = "10.3847/0004-637X/828/1/46"#os.getenv('DOI_ENV') #

if node is not None:
print(f"NODE_ENVIRONMENT is set to: {node}")
else:
print("NODE_ENVIRONMENT is not set.")

json_ld = get_jsonld(node)
print(json_ld)

if doi:
lookup_results = paper_data_json_single(doi)
#updated_json_ld = update_json_ld(json_ld, lookup_results)

else:
updated_json_ld = json_ld

llm_output = asyncio.run(langchain_paper_search(node))# output of unstructured text in dictionary
#updated_json_ld = update_json_ld(json_ld, llm_output)
updated_json_ld = json_ld
#doi = "https://doi.org/10.1002/adma.202208113"

#print(updated_json_ld)

print("Script completed")

if __name__ == "__main__":
run("46", "https://doi.org/10.1002/adma.202208113")
