# reddit_api_scraping.py
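"""Collect score and flair data from Reddit through the PRAW API.

Reads CSV files of submission ids, comment ids, and subscribed subreddits from
./data/raw_data/, fetches the current scores and link-flair templates, merges
in manually annotated tags, and writes the results to ./data/scrapped_data/.
Run from the repository root so that the relative ./data/ paths resolve.
"""
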
def get_submission_score(reddit, submission_id):
    """Return the current score of a submission, or NaN if it cannot be fetched."""
    from numpy import nan as np_nan

    try:
        submission = reddit.submission(id=submission_id)
        submission_score = submission.score
    except Exception:
        # Return NaN if the submission is hidden or inaccessible for some reason
        return np_nan
    return submission_score

def get_comment_score(reddit, comment_id):
    """Return the current score of a comment, or NaN if it cannot be fetched."""
    from numpy import nan as np_nan

    try:
        comment = reddit.comment(id=comment_id)
        comment_score = comment.score
    except Exception:
        # Return NaN if the comment is hidden or inaccessible for some reason
        return np_nan
    return comment_score

def scrape_post_scores(reddit, ids):
    """Fetch the score of every submission id in `ids` and return them as a list."""
    scores = []
    for submission_id in ids:
        score = get_submission_score(reddit, submission_id)
        print(f"ID: {submission_id} - Score: {score}")
        scores.append(score)
    return scores

def scrape_post_votes(reddit, path, fname, save_path):
    """Scrape current scores for the posts listed in `fname` and save them to `save_path`."""
    import pandas as pd

    # Read the csv file containing the submission ids for the posts that I have voted on
    post_votes = pd.read_csv(path + fname)
    # Get the current score of each post
    scores = scrape_post_scores(reddit, post_votes["id"])
    # Add the scores to the dataframe
    post_votes["Scores"] = scores
    # Save the dataframe to a csv file
    post_votes.to_csv(save_path + fname, index=False)
    print(f"Saved the vote counts to {save_path + fname}")

def scrape_comment_votes(reddit, path, fname, save_path):
    """Scrape current scores for the comments listed in `fname` and save them to `save_path`."""
    import pandas as pd

    # Read the csv file containing the ids of the comments that I have voted on
    comment_votes = pd.read_csv(path + fname)
    scores = []
    for comment_id in comment_votes["id"]:
        score = get_comment_score(reddit, comment_id)
        print(f"ID: {comment_id} - Score: {score}")
        scores.append(score)
    # Add the scores to the dataframe
    comment_votes["Score"] = scores
    # Save the dataframe to a csv file
    comment_votes.to_csv(save_path + fname, index=False)
    print(f"Saved the vote counts to {save_path + fname}")

def get_sub_flairs(reddit, path, fname, save_path):
    """Fetch the link-flair templates of each subreddit listed in `fname`."""
    import pandas as pd

    # Read the csv file containing the subreddit names
    subreddits_df = pd.read_csv(path + fname)
    # Add a flairs column to the dataframe
    subreddits_df["Flairs"] = None
    # Get the flairs for each subreddit
    for sub_name in subreddits_df["subreddit"]:
        sub = reddit.subreddit(sub_name)
        try:
            flairs = []
            for f in sub.flair.link_templates:
                flairs.append(f["text"].lower())
            flairs = ", ".join(flairs)
        except Exception:
            # Flair templates were not accessible (e.g. a private sub or missing permissions)
            flairs = None
        # Add the flairs to the dataframe
        subreddits_df.loc[subreddits_df["subreddit"] == sub_name, "Flairs"] = flairs
    # Save the dataframe to the csv file
    subreddits_df.to_csv(save_path + fname, index=False)
    print(f"Saved the sub flairs to {save_path + fname}")

def main():
    import os

    import pandas as pd
    import praw
    from dotenv import load_dotenv

    # Load environment variables from .env file
    load_dotenv()
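    # The .env file is assumed to define the following keys (the values below
    # are placeholders, not real credentials):
    #   CLIENT_ID=<reddit app client id>
    #   CLIENT_SECRET=<reddit app client secret>
    #   REDDIT_USERNAME=<reddit account username>
    #   REDDIT_PASSWORD=<reddit account password>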
    # Get a Reddit client, authenticating with the password (script-app) flow
    reddit = praw.Reddit(
        client_id=os.getenv("CLIENT_ID"),
        client_secret=os.getenv("CLIENT_SECRET"),
        password=os.getenv("REDDIT_PASSWORD"),
        username=os.getenv("REDDIT_USERNAME"),
        user_agent="script:myRedditDataCollecterBot:v1.0.0 by u/personalScraper",
    )
    raw_data_path = "./data/raw_data/"
    save_path = "./data/scrapped_data/"

    print("Getting vote counts for posts...")
    scrape_post_votes(reddit, raw_data_path, "post_votes.csv", save_path=save_path)
    scrape_post_votes(reddit, raw_data_path, "post_headers.csv", save_path=save_path)

    print("Getting vote counts for comments...")
    scrape_comment_votes(reddit, raw_data_path, "comment_votes.csv", save_path=save_path)
    scrape_comment_votes(reddit, raw_data_path, "comment_headers.csv", save_path=save_path)

    print("Getting flairs for the subreddits...")
    get_sub_flairs(reddit, raw_data_path, "subscribed_subreddits.csv", save_path=save_path)

    print("Combining scraped data with annotated tags...")
    # Combine the annotated tags with the scraped data
    annotated_data = pd.read_csv(raw_data_path + "tags.csv")
    scraped_data = pd.read_csv(save_path + "subscribed_subreddits.csv")
    scraped_data["Tags"] = None  # Add a new column for the tags
    # Read the tags annotated for each sub. Each column after the first (which is
    # assumed to be the form timestamp) is assumed to be named like "r/<subreddit> ...",
    # with each cell holding a comma-separated list of tags from one annotator.
    for col in annotated_data.columns[1:]:
        sub_name = col.split(" ")[0][2:]  # Strip the leading "r/" from the column name
        # Count how often each tag was given for this sub
        tags = annotated_data[col].dropna().map(lambda x: x.split(", ")).explode().value_counts()
        # Keep at most the 3 most common tags among those given at least as often as the mean
        accepted_tags = tags[tags >= tags.mean()].index.to_list()[:3]
        scraped_data.loc[scraped_data["subreddit"] == sub_name, "Tags"] = ", ".join(accepted_tags)
    # Save the dataframe to the csv file
    scraped_data.to_csv(save_path + "subscribed_subreddits.csv", index=False)

if __name__ == "__main__":
    main()