Reddit_scraper.py
import csv
import time

import requests
from bs4 import BeautifulSoup


def write_to_csv(row_array):
    """
    Stores the scraped post info in a .csv file.
    :param row_array: list of rows, one row of post details per post
    :return: None
    """
    # Headings for the first row of the file
    header_list = ["Title", "Author", "Date and Time", "Upvotes", "Comments", "Url"]
    file_name = input("\nEnter the name of file to store the info: ")
    # Adding the info into the rows of the file
    # newline="" prevents the csv module from writing blank rows on Windows
    with open(file_name + ".csv", "a", encoding="utf-8", newline="") as csv_f:
        csv_pointer = csv.writer(csv_f, delimiter=",")
        csv_pointer.writerow(header_list)
        csv_pointer.writerows(row_array)
    print(f"Done! Check your directory for the {file_name}.csv file!")


def scraper():
    """
    Scrapes the post info from the desired subreddit and stores it
    into the desired file.
    :return: None
    """
    subreddit = input("Enter the name of the subreddit: r/").lower()
    max_count = int(input("Enter the maximum number of entries to collect: "))
    # Generating the URL leading to the desired subreddit
    url = "https://old.reddit.com/r/" + subreddit
    # Using a user-agent to mimic browser activity
    headers = {"User-Agent": "Mozilla/5.0"}
    req = requests.get(url, headers=headers)
    if req.status_code == 200:
        # Parsing the web page to locate the right HTML tags and
        # scraping the details required
        soup = BeautifulSoup(req.text, "html.parser")
        print("\nCOLLECTING INFORMATION....")
        attrs = {"class": "thing"}
        counter = 1
        full = False
        reddit_info = []
        while True:
            for post in soup.find_all("div", attrs=attrs):
                try:
                    title = post.find("a", class_="title").text
                    author = post.find("a", class_="author").text
                    time_stamp = post.time.attrs["title"]
                    comments = post.find("a", class_="comments").text.split()[0]
                    if comments == "comment":
                        comments = 0
                    upvotes = post.find("div", class_="score likes").text
                    if upvotes == "•":
                        upvotes = "None"
                    link = post.find("a", class_="title")["href"]
                    # Text posts have relative links; external links are already absolute
                    if link.startswith("/"):
                        link = "https://www.reddit.com" + link
                    # Storing the scraped data in an array
                    reddit_info.append(
                        [title, author, time_stamp, upvotes, comments, link]
                    )
                    if counter == max_count:
                        full = True
                        break
                    counter += 1
                except AttributeError:
                    # Skip entries missing one of these tags (e.g. promoted posts)
                    continue
            if full:
                break
            try:
                # Following the link to the next page of the subreddit
                next_button = soup.find("span", class_="next-button")
                next_page_link = next_button.find("a").attrs["href"]
                time.sleep(2)
                req = requests.get(next_page_link, headers=headers)
                soup = BeautifulSoup(req.text, "html.parser")
            except AttributeError:
                # No next button on the last page, so stop paginating
                break
        # Writing the stored information into a .csv file
        print("DONE!\n")
        write_to_csv(reddit_info)
    else:
        print("Error fetching results. Try again!")


if __name__ == "__main__":
    scraper()
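A quick way to sanity-check the output is to read the file back with the csv module, using the header row that write_to_csv writes. A minimal sketch, assuming the entries were saved under the example name "posts" (so the file is posts.csv); the filename is whatever you typed at the prompt:

import csv

# "posts.csv" is only an example; substitute the name entered when the scraper ran
with open("posts.csv", encoding="utf-8") as csv_f:
    for row in csv.DictReader(csv_f):
        print(row["Title"], row["Upvotes"], row["Url"])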