-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathScraping_Beautifulsoup4.py
More file actions
executable file
·130 lines (85 loc) · 3.19 KB
/
Scraping_Beautifulsoup4.py
File metadata and controls
executable file
·130 lines (85 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Web scraping in Python using BeautifulSoup4.
#
# Dependencies (install once):
#   pip install beautifulsoup4 lxml html5lib requests
import csv  # used by the (currently commented-out) CSV export near the bottom

import requests
from bs4 import BeautifulSoup

# Browser-like headers help avoid trivial bot blocking. NOTE: requests has
# NO default timeout — always pass one, or a stalled server hangs the script.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/111.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
TIMEOUT = 10  # seconds

# -----------------------------------------------------------
# Example 1: parsing a local HTML file
# -----------------------------------------------------------
# with open('example.html') as html_file:
#     soup = BeautifulSoup(html_file, 'lxml')
#     # print(soup.prettify())   # dump the whole parsed tree
#     match = soup.div           # first <div> in the document
#     match1 = soup.title        # the <title> tag object
#     match2 = soup.title.text   # just the title text
#     print(match)
#     print(match1)
#     print(match2)
#
# Scraping the headline and summary from example.html.
# 'class_' (trailing underscore) is used because 'class' is a reserved
# keyword in Python.
# article = soup.find('div', class_='article')
# headline = article.h2.a.text
# summary = article.p.text
# print(headline)
# print(summary)
#
# Or iterate over every matching article:
# for article in soup.find_all('div', class_='article'):
#     headline = article.h2.a.text
#     print(headline)
#     summary = article.p.text
#     print(summary)
#     print()

# -----------------------------------------------------------
# Example 2: scraping live websites
# -----------------------------------------------------------
# source = requests.get('https://www.patreon.com/coreyms', headers=HEADERS, timeout=TIMEOUT)
source = requests.get('https://www.ambitionbox.com/overview/tcs-overview',
                      headers=HEADERS, timeout=TIMEOUT)
source.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
soup = BeautifulSoup(source.text, 'html.parser')
print(soup.prettify())
print()

url = "https://hicounselor.com/scan-resume"
response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
response.raise_for_status()
print(response.text)

# csv_file = open('cms_scrape.csv', 'w')
# csv_writer = csv.writer(csv_file)
# csv_writer.writerow([headline, summary])

# -----------------------------------------------------------
# Example 3: scraping an embedded YouTube video link/id
# -----------------------------------------------------------
# for article in soup.find_all('article'):
#     headline = article.h2.text
#     print(headline)
#     summary = article.find('div', class_='entry-content').p.text
#     print(summary)
#     try:
#         # Tag attributes are read with [] — here the iframe's embed URL.
#         vid_src = article.find('iframe', class_='youtube-player')['src']
#         # e.g. https://www.youtube.com/embed/K6L6KVGG-7o?version=3
#         vid_id = vid_src.split('/')[4]   # the id sits at index 4 of the path
#         vid_id = vid_id.split('?')[0]    # drop the query string
#         print(vid_id)                    # e.g. K6L6KVGG-7o
#         yt_link = f'https://youtube.com/watch?v={vid_id}'
#         print(yt_link)
#     except Exception as e:
#         raise e
#     print()
#     csv_writer.writerow([headline, summary, yt_link])
# csv_file.close()