-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathScraping_Beautifulsoup4.py
More file actions
executable file
·130 lines (85 loc) · 3.19 KB
/
Scraping_Beautifulsoup4.py
File metadata and controls
executable file
·130 lines (85 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Web scraping in Python using BeautifulSoup4.
#
# Dependencies (install once):
#   pip install beautifulsoup4 lxml html5lib requests
import csv  # used by the (currently commented-out) CSV export near the bottom

import requests
from bs4 import BeautifulSoup

# Browser-like headers help avoid trivial bot blocking. NOTE: requests has
# NO default timeout — always pass one, or a stalled server hangs the script.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/111.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
TIMEOUT = 10  # seconds

# -----------------------------------------------------------
# Example 1: parsing a local HTML file
# -----------------------------------------------------------
# with open('example.html') as html_file:
#     soup = BeautifulSoup(html_file, 'lxml')
#     # print(soup.prettify())   # dump the whole parsed tree
#     match = soup.div           # first <div> in the document
#     match1 = soup.title        # the <title> tag object
#     match2 = soup.title.text   # just the title text
#     print(match)
#     print(match1)
#     print(match2)
#
# Scraping the headline and summary from example.html.
# 'class_' (trailing underscore) is used because 'class' is a reserved
# keyword in Python.
# article = soup.find('div', class_='article')
# headline = article.h2.a.text
# summary = article.p.text
# print(headline)
# print(summary)
#
# Or iterate over every matching article:
# for article in soup.find_all('div', class_='article'):
#     headline = article.h2.a.text
#     print(headline)
#     summary = article.p.text
#     print(summary)
#     print()

# -----------------------------------------------------------
# Example 2: scraping live websites
# -----------------------------------------------------------
# source = requests.get('https://www.patreon.com/coreyms', headers=HEADERS, timeout=TIMEOUT)
source = requests.get('https://www.ambitionbox.com/overview/tcs-overview',
                      headers=HEADERS, timeout=TIMEOUT)
source.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
soup = BeautifulSoup(source.text, 'html.parser')
print(soup.prettify())
print()

url = "https://hicounselor.com/scan-resume"
response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
response.raise_for_status()
print(response.text)

# csv_file = open('cms_scrape.csv', 'w')
# csv_writer = csv.writer(csv_file)
# csv_writer.writerow([headline, summary])

# -----------------------------------------------------------
# Example 3: scraping an embedded YouTube video link/id
# -----------------------------------------------------------
# for article in soup.find_all('article'):
#     headline = article.h2.text
#     print(headline)
#     summary = article.find('div', class_='entry-content').p.text
#     print(summary)
#     try:
#         # Tag attributes are read with [] — here the iframe's embed URL.
#         vid_src = article.find('iframe', class_='youtube-player')['src']
#         # e.g. https://www.youtube.com/embed/K6L6KVGG-7o?version=3
#         vid_id = vid_src.split('/')[4]   # the id sits at index 4 of the path
#         vid_id = vid_id.split('?')[0]    # drop the query string
#         print(vid_id)                    # e.g. K6L6KVGG-7o
#         yt_link = f'https://youtube.com/watch?v={vid_id}'
#         print(yt_link)
#     except Exception as e:
#         raise e
#     print()
#     csv_writer.writerow([headline, summary, yt_link])
# csv_file.close()