diff --git a/examples/cdp_mode/ReadMe.md b/examples/cdp_mode/ReadMe.md index 4e838b1bccb..2e0b43d3923 100644 --- a/examples/cdp_mode/ReadMe.md +++ b/examples/cdp_mode/ReadMe.md @@ -471,6 +471,7 @@ sb.cdp.get_current_url() sb.cdp.get_origin() sb.cdp.get_html(include_shadow_dom=True) sb.cdp.get_page_source(include_shadow_dom=True) +sb.cdp.get_beautiful_soup(source=None) sb.cdp.get_user_agent() sb.cdp.get_cookie_string() sb.cdp.get_locale_code() diff --git a/examples/cdp_mode/playwright/ReadMe.md b/examples/cdp_mode/playwright/ReadMe.md index c6b14fb3a93..cbc375382e6 100644 --- a/examples/cdp_mode/playwright/ReadMe.md +++ b/examples/cdp_mode/playwright/ReadMe.md @@ -88,6 +88,12 @@ The `SB()` format requires WebDriver, therefore `chromedriver` will be downloade In the sync formats, `get_endpoint_url()` also applies `nest-asyncio` so that nested event loops are allowed. (Python doesn't allow nested event loops by default). Without this, you'd get the error: `"Cannot run the event loop while another loop is running"` when calling CDP Mode methods (such as `solve_captcha()`) from within the Playwright context manager. This `nest-asyncio` call is done behind-the-scenes so that users don't need to handle this on their own. +Default timeout values are different between Playwright and SeleniumBase. For instance, a 30-second default timeout in a Playwright method might be 10 seconds in the equivalent SeleniumBase method. When specifying custom timeout values, Playwright uses milliseconds, whereas SeleniumBase uses seconds. E.g., `page.wait_for_timeout(500)` is the equivalent of `sb.sleep(0.5)`. + +Playwright's `:has-text()` selector is the equivalent of SeleniumBase's `:contains()` selector, except for one small difference: `:has-text()` isn't case-sensitive, but `:contains()` is. + +Unlike normal Playwright, you don't need to run `playwright install` before running Stealthy Playwright Mode scripts because the system Chrome will be used. 
There's also the option of setting `use_chromium=True` to use the unbranded Chromium browser instead, which still supports extensions. + ### 🎭 Stealthy Playwright Mode examples: Here's an example that queries Microsoft Copilot: @@ -105,16 +111,16 @@ with sync_playwright() as p: page = context.pages[0] page.goto("https://copilot.microsoft.com") page.wait_for_selector("textarea#userInput") - sb.sleep(1) + page.wait_for_timeout(1000) query = "Playwright Python connect_over_cdp() sync example" page.fill("textarea#userInput", query) page.click('button[data-testid="submit-button"]') - sb.sleep(3) + page.wait_for_timeout(4000) sb.solve_captcha() page.wait_for_selector('button[data-testid*="-thumbs-up"]') - sb.sleep(4) + page.wait_for_timeout(4000) page.click('button[data-testid*="scroll-to-bottom"]') - sb.sleep(3) + page.wait_for_timeout(3000) chat_results = '[data-testid="highlighted-chats"]' result = page.locator(chat_results).inner_text() print(result.replace("\n\n", " \n")) @@ -134,9 +140,9 @@ with sync_playwright() as p: context = browser.contexts[0] page = context.pages[0] page.goto("https://www.bing.com/turing/captcha/challenge") - sb.sleep(3) + page.wait_for_timeout(2000) sb.solve_captcha() - sb.sleep(3) + page.wait_for_timeout(2000) ``` -------- diff --git a/examples/cdp_mode/playwright/raw_basic_async.py b/examples/cdp_mode/playwright/raw_basic_async.py index 4419282a932..db3a72e7059 100644 --- a/examples/cdp_mode/playwright/raw_basic_async.py +++ b/examples/cdp_mode/playwright/raw_basic_async.py @@ -16,7 +16,7 @@ async def main(): await page.fill("#password", "secret_pass") await page.click("#log-in") await page.wait_for_selector("h1") - await driver.sleep(1) + await page.wait_for_timeout(1000) if __name__ == "__main__": diff --git a/examples/cdp_mode/playwright/raw_basic_nested.py b/examples/cdp_mode/playwright/raw_basic_nested.py index cf0214a92be..e8e22b7995e 100644 --- a/examples/cdp_mode/playwright/raw_basic_nested.py +++ 
b/examples/cdp_mode/playwright/raw_basic_nested.py @@ -14,4 +14,4 @@ page.fill("#password", "secret_pass") page.click("#log-in") page.wait_for_selector("h1") - sb.sleep(1) + page.wait_for_timeout(1000) diff --git a/examples/cdp_mode/playwright/raw_basic_sync.py b/examples/cdp_mode/playwright/raw_basic_sync.py index 5115051259d..bd8038152c9 100644 --- a/examples/cdp_mode/playwright/raw_basic_sync.py +++ b/examples/cdp_mode/playwright/raw_basic_sync.py @@ -13,4 +13,4 @@ page.fill("#password", "secret_pass") page.click("#log-in") page.wait_for_selector("h1") - sb.sleep(1) + page.wait_for_timeout(1000) diff --git a/examples/cdp_mode/playwright/raw_bing_cap_async.py b/examples/cdp_mode/playwright/raw_bing_cap_async.py index 7781aa96a2e..47c84f01699 100644 --- a/examples/cdp_mode/playwright/raw_bing_cap_async.py +++ b/examples/cdp_mode/playwright/raw_bing_cap_async.py @@ -12,9 +12,9 @@ async def main(): context = browser.contexts[0] page = context.pages[0] await page.goto("https://www.bing.com/turing/captcha/challenge") - await driver.sleep(3) + await page.wait_for_timeout(2000) await driver.solve_captcha() - await driver.sleep(3) + await page.wait_for_timeout(2000) if __name__ == "__main__": diff --git a/examples/cdp_mode/playwright/raw_bing_cap_nested.py b/examples/cdp_mode/playwright/raw_bing_cap_nested.py index 3c9cb9f9d97..f414bf9cc57 100644 --- a/examples/cdp_mode/playwright/raw_bing_cap_nested.py +++ b/examples/cdp_mode/playwright/raw_bing_cap_nested.py @@ -10,6 +10,6 @@ context = browser.contexts[0] page = context.pages[0] page.goto("https://www.bing.com/turing/captcha/challenge") - sb.sleep(3) + page.wait_for_timeout(2000) sb.solve_captcha() - sb.sleep(3) + page.wait_for_timeout(2000) diff --git a/examples/cdp_mode/playwright/raw_bing_cap_sync.py b/examples/cdp_mode/playwright/raw_bing_cap_sync.py index 71242af8cca..9e4f6f54d5c 100644 --- a/examples/cdp_mode/playwright/raw_bing_cap_sync.py +++ b/examples/cdp_mode/playwright/raw_bing_cap_sync.py @@ -9,6 +9,6 @@ 
context = browser.contexts[0] page = context.pages[0] page.goto("https://www.bing.com/turing/captcha/challenge") - sb.sleep(3) + page.wait_for_timeout(2000) sb.solve_captcha() - sb.sleep(3) + page.wait_for_timeout(2000) diff --git a/examples/cdp_mode/playwright/raw_cf_cap_sync.py b/examples/cdp_mode/playwright/raw_cf_cap_sync.py index 8bfa09269b1..348d0d9041b 100644 --- a/examples/cdp_mode/playwright/raw_cf_cap_sync.py +++ b/examples/cdp_mode/playwright/raw_cf_cap_sync.py @@ -9,6 +9,6 @@ context = browser.contexts[0] page = context.pages[0] page.goto("https://www.cloudflare.com/login") - sb.sleep(3) + page.wait_for_timeout(4500) sb.solve_captcha() - sb.sleep(3) + page.wait_for_timeout(3000) diff --git a/examples/cdp_mode/playwright/raw_copilot_async.py b/examples/cdp_mode/playwright/raw_copilot_async.py index 19ecc39ee25..cea94adaa33 100644 --- a/examples/cdp_mode/playwright/raw_copilot_async.py +++ b/examples/cdp_mode/playwright/raw_copilot_async.py @@ -13,16 +13,16 @@ async def main(): page = context.pages[0] await page.goto("https://copilot.microsoft.com") await page.wait_for_selector("textarea#userInput") - await driver.sleep(1) + await page.wait_for_timeout(1000) query = "Playwright Python connect_over_cdp() sync example" await page.fill("textarea#userInput", query) await page.click('button[data-testid="submit-button"]') - await driver.sleep(4) + await page.wait_for_timeout(4000) await driver.solve_captcha() await page.wait_for_selector('button[data-testid*="-thumbs-up"]') - await driver.sleep(4) + await page.wait_for_timeout(4000) await page.click('button[data-testid*="scroll-to-bottom"]') - await driver.sleep(3) + await page.wait_for_timeout(3000) chat_results = '[data-testid="highlighted-chats"]' result = await page.locator(chat_results).inner_text() print(result.replace("\n\n", " \n")) diff --git a/examples/cdp_mode/playwright/raw_copilot_nested.py b/examples/cdp_mode/playwright/raw_copilot_nested.py index 9a01f70dfd2..fa0b74b8766 100644 --- 
a/examples/cdp_mode/playwright/raw_copilot_nested.py +++ b/examples/cdp_mode/playwright/raw_copilot_nested.py @@ -11,16 +11,16 @@ page = context.pages[0] page.goto("https://copilot.microsoft.com") page.wait_for_selector("textarea#userInput") - sb.sleep(1) + page.wait_for_timeout(1000) query = "Playwright Python connect_over_cdp() sync example" page.fill("textarea#userInput", query) page.click('button[data-testid="submit-button"]') - sb.sleep(4) + page.wait_for_timeout(4000) sb.solve_captcha() page.wait_for_selector('button[data-testid*="-thumbs-up"]') - sb.sleep(4) + page.wait_for_timeout(4000) page.click('button[data-testid*="scroll-to-bottom"]') - sb.sleep(3) + page.wait_for_timeout(3000) chat_results = '[data-testid="highlighted-chats"]' result = page.locator(chat_results).inner_text() print(result.replace("\n\n", " \n")) diff --git a/examples/cdp_mode/playwright/raw_copilot_sync.py b/examples/cdp_mode/playwright/raw_copilot_sync.py index 46dc41d88c8..451f44ce873 100644 --- a/examples/cdp_mode/playwright/raw_copilot_sync.py +++ b/examples/cdp_mode/playwright/raw_copilot_sync.py @@ -10,16 +10,16 @@ page = context.pages[0] page.goto("https://copilot.microsoft.com") page.wait_for_selector("textarea#userInput") - sb.sleep(1) + page.wait_for_timeout(1000) query = "Playwright Python connect_over_cdp() sync example" page.fill("textarea#userInput", query) page.click('button[data-testid="submit-button"]') - sb.sleep(4) + page.wait_for_timeout(4000) sb.solve_captcha() page.wait_for_selector('button[data-testid*="-thumbs-up"]') - sb.sleep(4) + page.wait_for_timeout(4000) page.click('button[data-testid*="scroll-to-bottom"]') - sb.sleep(3) + page.wait_for_timeout(3000) chat_results = '[data-testid="highlighted-chats"]' result = page.locator(chat_results).inner_text() print(result.replace("\n\n", " \n")) diff --git a/examples/cdp_mode/playwright/raw_gas_info_async.py b/examples/cdp_mode/playwright/raw_gas_info_async.py index 404b2e4e6c3..9807db39abe 100644 --- 
a/examples/cdp_mode/playwright/raw_gas_info_async.py +++ b/examples/cdp_mode/playwright/raw_gas_info_async.py @@ -18,21 +18,22 @@ async def main(): "/order-replacement-building-regulations-certificate/" ) await page.goto(url) - await tab.sleep(0.6) + await page.wait_for_timeout(600) await tab.solve_captcha() await page.wait_for_selector("#SearchTerm") - await tab.sleep(1.4) + await page.wait_for_timeout(2000) allow_cookies = 'button:contains("Allow all cookies")' await tab.click_if_visible(allow_cookies, timeout=2) - await tab.sleep(1) + await page.wait_for_timeout(1000) await page.fill("#SearchTerm", "Hydrogen") + await tab.click_if_visible(allow_cookies, timeout=1) await page.click("button.search-button") - await tab.sleep(3) + await page.wait_for_timeout(3000) results = await tab.query_selector_all("div.search-result") for result in results: print(result.text.replace(" " * 12, " ").strip() + "\n") await tab.scroll_down(50) - await tab.sleep(1) + await page.wait_for_timeout(1000) if __name__ == "__main__": diff --git a/examples/cdp_mode/playwright/raw_gas_info_sync.py b/examples/cdp_mode/playwright/raw_gas_info_sync.py index 3880ff62ea8..a948a0f048f 100644 --- a/examples/cdp_mode/playwright/raw_gas_info_sync.py +++ b/examples/cdp_mode/playwright/raw_gas_info_sync.py @@ -14,19 +14,20 @@ "/order-replacement-building-regulations-certificate/" ) page.goto(url) - sb.sleep(0.6) + page.wait_for_timeout(600) sb.solve_captcha() page.wait_for_selector("#SearchTerm") - sb.sleep(1.4) + page.wait_for_timeout(2000) allow_cookies = 'button:contains("Allow all cookies")' sb.click_if_visible(allow_cookies, timeout=2) - sb.sleep(1) + page.wait_for_timeout(1000) page.fill("#SearchTerm", "Hydrogen") + sb.click_if_visible(allow_cookies, timeout=1) page.click("button.search-button") - sb.sleep(3) + page.wait_for_timeout(3000) items = page.locator("div.search-result") for i in range(items.count()): item_text = items.nth(i).inner_text() print(item_text.replace("\n\n", "\n") + "\n") 
sb.scroll_to_bottom() - sb.sleep(1) + page.wait_for_timeout(3000) diff --git a/examples/cdp_mode/playwright/raw_gitlab_async.py b/examples/cdp_mode/playwright/raw_gitlab_async.py index 32687079686..978d148196a 100644 --- a/examples/cdp_mode/playwright/raw_gitlab_async.py +++ b/examples/cdp_mode/playwright/raw_gitlab_async.py @@ -12,13 +12,13 @@ async def main(): context = browser.contexts[0] page = context.pages[0] await page.goto("https://gitlab.com/users/sign_in") - await driver.sleep(3) + await page.wait_for_timeout(3000) await driver.solve_captcha() - await driver.sleep(1) + await page.wait_for_timeout(1000) await page.locator('label[for="user_login"]').click() await page.wait_for_selector('[data-testid="sign-in-button"]') await page.locator("#user_login").fill("Username") - await driver.sleep(2) + await page.wait_for_timeout(2000) if __name__ == "__main__": diff --git a/examples/cdp_mode/playwright/raw_gitlab_nested.py b/examples/cdp_mode/playwright/raw_gitlab_nested.py index eafc7b29019..75b96df1dd1 100644 --- a/examples/cdp_mode/playwright/raw_gitlab_nested.py +++ b/examples/cdp_mode/playwright/raw_gitlab_nested.py @@ -10,10 +10,10 @@ context = browser.contexts[0] page = context.pages[0] page.goto("https://gitlab.com/users/sign_in") - sb.sleep(3) + page.wait_for_timeout(3000) sb.solve_captcha() - sb.sleep(1) + page.wait_for_timeout(1000) page.locator('label[for="user_login"]').click() page.wait_for_selector('[data-testid="sign-in-button"]') page.locator("#user_login").fill("Username") - sb.sleep(2) + page.wait_for_timeout(2000) diff --git a/examples/cdp_mode/playwright/raw_gitlab_sync.py b/examples/cdp_mode/playwright/raw_gitlab_sync.py index 337c9984644..95253a0c92f 100644 --- a/examples/cdp_mode/playwright/raw_gitlab_sync.py +++ b/examples/cdp_mode/playwright/raw_gitlab_sync.py @@ -9,10 +9,10 @@ context = browser.contexts[0] page = context.pages[0] page.goto("https://gitlab.com/users/sign_in") - sb.sleep(3) + page.wait_for_timeout(3000) sb.solve_captcha() 
- sb.sleep(1) + page.wait_for_timeout(1000) page.locator('label[for="user_login"]').click() page.wait_for_selector('[data-testid="sign-in-button"]') page.locator("#user_login").fill("Username") - sb.sleep(2) + page.wait_for_timeout(2000) diff --git a/examples/cdp_mode/playwright/raw_indeed_sync.py b/examples/cdp_mode/playwright/raw_indeed_sync.py new file mode 100644 index 00000000000..827f6698031 --- /dev/null +++ b/examples/cdp_mode/playwright/raw_indeed_sync.py @@ -0,0 +1,33 @@ +from playwright.sync_api import sync_playwright +from seleniumbase import sb_cdp + +sb = sb_cdp.Chrome() +sb.open("https://www.indeed.com/companies/search") +endpoint_url = sb.get_endpoint_url() + +with sync_playwright() as p: + browser = p.chromium.connect_over_cdp(endpoint_url) + context = browser.contexts[0] + page = context.pages[0] + search_box = "input#company-search" + if page.locator(search_box).count() == 0: + page.wait_for_timeout(2500) + sb.solve_captcha() + page.wait_for_timeout(1000) + company = "NASA Jet Propulsion Laboratory" + page.click(search_box) + page.fill(search_box, company) + page.click('button[type="submit"]') + page.click('a:has-text("%s")' % company) + name_header = 'div[itemprop="name"]' + page.wait_for_timeout(1000) + if page.locator(name_header).count() == 0: + page.wait_for_timeout(2500) + sb.solve_captcha() + page.wait_for_timeout(1000) + for i in range(10): + sb.scroll_down(12) + sb.sleep(0.14) + info = page.locator('[data-testid="AboutSection-section"]') + soup = sb.get_beautiful_soup(info.inner_html()).get_text("\n") + print("*** %s: ***\n%s" % (company, soup)) diff --git a/examples/cdp_mode/playwright/raw_nike_sync.py b/examples/cdp_mode/playwright/raw_nike_sync.py index 5cb04ba8cd8..a453b47e963 100644 --- a/examples/cdp_mode/playwright/raw_nike_sync.py +++ b/examples/cdp_mode/playwright/raw_nike_sync.py @@ -12,7 +12,7 @@ page.click('[data-testid="user-tools-container"] search') search = "Pegasus" page.fill('input[type="search"]', search) - sb.sleep(4) 
+ page.wait_for_timeout(4000) details = 'ul[data-testid*="products"] figure .details' items = page.locator(details) if items: diff --git a/examples/cdp_mode/playwright/raw_nordstrom_sync.py b/examples/cdp_mode/playwright/raw_nordstrom_sync.py index 3e9741530d0..777dca0fe2d 100644 --- a/examples/cdp_mode/playwright/raw_nordstrom_sync.py +++ b/examples/cdp_mode/playwright/raw_nordstrom_sync.py @@ -9,12 +9,13 @@ context = browser.contexts[0] page = context.pages[0] page.goto("https://www.nordstrom.com/") - sb.sleep(2) + page.wait_for_timeout(2000) page.click("input#keyword-search-input") - sb.sleep(0.8) + page.wait_for_timeout(800) search = "cocktail dresses for women teal" - sb.press_keys("input#keyword-search-input", search + "\n") - sb.sleep(2.2) + search_box = page.locator("input#keyword-search-input") + search_box.press_sequentially(search + "\n", delay=80) + page.wait_for_timeout(2200) for i in range(17): sb.scroll_down(16) sb.sleep(0.14) diff --git a/examples/cdp_mode/playwright/raw_planetmc_sync.py b/examples/cdp_mode/playwright/raw_planetmc_sync.py index 13cd14c3d4d..6df381a3d33 100644 --- a/examples/cdp_mode/playwright/raw_planetmc_sync.py +++ b/examples/cdp_mode/playwright/raw_planetmc_sync.py @@ -9,7 +9,8 @@ context = browser.contexts[0] page = context.pages[0] page.goto("https://www.planetminecraft.com/account/sign_in/") - sb.sleep(2) + page.wait_for_timeout(2000) sb.solve_captcha() - sb.wait_for_element_absent("input[disabled]") - sb.sleep(2) + input_disabled = page.locator("input[disabled]") + input_disabled.wait_for(state="hidden", timeout=5000) + page.wait_for_timeout(2000) diff --git a/examples/cdp_mode/playwright/raw_reddit_sync.py b/examples/cdp_mode/playwright/raw_reddit_sync.py index 10c0c67996c..aec52edd9e8 100644 --- a/examples/cdp_mode/playwright/raw_reddit_sync.py +++ b/examples/cdp_mode/playwright/raw_reddit_sync.py @@ -12,7 +12,7 @@ url = f"https://www.reddit.com/r/webscraping/search/?q={search}" page.goto(url) sb.solve_captcha() # Might 
not be needed - sb.sleep(1) + page.wait_for_timeout(1000) post_title = '[data-testid="post-title"]' page.wait_for_selector(post_title) for i in range(8): diff --git a/examples/cdp_mode/playwright/raw_seatgeek_sync.py b/examples/cdp_mode/playwright/raw_seatgeek_sync.py index 5f64a307ab3..1f6342d597b 100644 --- a/examples/cdp_mode/playwright/raw_seatgeek_sync.py +++ b/examples/cdp_mode/playwright/raw_seatgeek_sync.py @@ -11,12 +11,13 @@ page.goto("https://seatgeek.com/") input_field = 'input[name="search"]' page.wait_for_selector(input_field) - sb.sleep(1.6) + page.wait_for_timeout(1600) query = "Jerry Seinfeld" - sb.press_keys(input_field, query) - sb.sleep(1.6) + search_box = page.locator(input_field) + search_box.press_sequentially(query, delay=80) + page.wait_for_timeout(1600) page.click("li#active-result-item") - sb.sleep(4.2) + page.wait_for_timeout(4200) print('*** SeatGeek Search for "%s":' % query) items = page.locator('[data-testid="listing-item"]') for i in range(items.count()): diff --git a/examples/cdp_mode/playwright/raw_walmart_sync.py b/examples/cdp_mode/playwright/raw_walmart_sync.py index 9289c71103e..13b288aaa71 100644 --- a/examples/cdp_mode/playwright/raw_walmart_sync.py +++ b/examples/cdp_mode/playwright/raw_walmart_sync.py @@ -2,26 +2,33 @@ from seleniumbase import sb_cdp sb = sb_cdp.Chrome(locale="en", guest=True) +sb.open("https://www.walmart.com/") endpoint_url = sb.get_endpoint_url() with sync_playwright() as p: browser = p.chromium.connect_over_cdp(endpoint_url) context = browser.contexts[0] page = context.pages[0] - page.goto("https://www.walmart.com/") - sb.sleep(2.6) + page.wait_for_timeout(2800) page.click('input[aria-label="Search"]') - sb.sleep(1.4) + page.wait_for_timeout(1800) search = "Settlers of Catan Board Game" required_text = "Catan" - sb.press_keys('input[aria-label="Search"]', search + "\n") - sb.sleep(3.8) + input_selector = 'input[aria-label="Search"]' + search_box = page.locator(input_selector) + 
search_box.press_sequentially(search + "\n", delay=80) + page.wait_for_timeout(3200) sb.remove_elements('[data-testid="skyline-ad"]') sb.remove_elements('[data-testid="sba-container"]') print('*** Walmart Search for "%s":' % search) print(' (Results must contain "%s".)' % required_text) unique_item = [] - sb.click_if_visible('[data-automation-id="sb-btn-close-mark"]') + pop_up = '[data-automation-id="sb-btn-close-mark"]' + if page.locator(pop_up).count() > 0: + page.click(pop_up) + page.wait_for_timeout(1200) + page.wait_for_selector('[data-item-id]', timeout=10000) + page.wait_for_timeout(600) items = page.locator('[data-item-id]') for i in range(items.count()): item = items.nth(i) @@ -33,9 +40,9 @@ and description.inner_text() not in unique_item ): unique_item.append(description.inner_text()) - print("* " + description.inner_text()) price = item.locator('[data-automation-id="product-price"]') - if price: + if price.count() > 0: + print("* " + description.inner_text()) price_text = price.inner_text() price_text = price_text.split("current price Now ")[-1] price_text = price_text.split("current price ")[-1] diff --git a/help_docs/cdp_mode_methods.md b/help_docs/cdp_mode_methods.md index 3e91ef9bd14..6256b8128f8 100644 --- a/help_docs/cdp_mode_methods.md +++ b/help_docs/cdp_mode_methods.md @@ -99,6 +99,7 @@ sb.cdp.get_current_url() sb.cdp.get_origin() sb.cdp.get_html(include_shadow_dom=True) sb.cdp.get_page_source(include_shadow_dom=True) +sb.cdp.get_beautiful_soup(source=None) sb.cdp.get_user_agent() sb.cdp.get_cookie_string() sb.cdp.get_locale_code() diff --git a/requirements.txt b/requirements.txt index 4bda62596de..ec7408c2b50 100755 --- a/requirements.txt +++ b/requirements.txt @@ -32,7 +32,8 @@ idna>=3.11 charset-normalizer>=3.4.6,<4 urllib3>=1.26.20,<2;python_version<"3.10" urllib3>=1.26.20,<3;python_version>="3.10" -requests~=2.32.5 +requests~=2.32.5;python_version<"3.10" +requests~=2.33.0;python_version>="3.10" sniffio==1.3.1 h11==0.16.0 
outcome==1.3.0.post0 diff --git a/seleniumbase/__version__.py b/seleniumbase/__version__.py index cc3ab74ee64..70f87b4cebd 100755 --- a/seleniumbase/__version__.py +++ b/seleniumbase/__version__.py @@ -1,2 +1,2 @@ # seleniumbase package -__version__ = "4.47.7" +__version__ = "4.47.8" diff --git a/seleniumbase/core/browser_launcher.py b/seleniumbase/core/browser_launcher.py index 5f3ccf266d5..3fa8bd3addb 100644 --- a/seleniumbase/core/browser_launcher.py +++ b/seleniumbase/core/browser_launcher.py @@ -872,6 +872,7 @@ def uc_open_with_cdp_mode(driver, url=None, **kwargs): cdp.get_gui_element_center = CDPM.get_gui_element_center cdp.get_html = CDPM.get_html cdp.get_page_source = CDPM.get_page_source + cdp.get_beautiful_soup = CDPM.get_beautiful_soup cdp.get_user_agent = CDPM.get_user_agent cdp.get_cookie_string = CDPM.get_cookie_string cdp.get_locale_code = CDPM.get_locale_code diff --git a/seleniumbase/core/sb_cdp.py b/seleniumbase/core/sb_cdp.py index 3a9e7dcc599..2cf63185849 100644 --- a/seleniumbase/core/sb_cdp.py +++ b/seleniumbase/core/sb_cdp.py @@ -117,10 +117,10 @@ def get(self, url, **kwargs): if hasattr(driver, "cdp_base"): driver = driver.cdp_base load_timeout = 60.0 - wait_timeout = 30.0 + wait_timeout = 50.0 if hasattr(sb_config, "_cdp_proxy") and sb_config._cdp_proxy: load_timeout = 90.0 - wait_timeout = 45.0 + wait_timeout = 75.0 try: task = self.page.get(url, **kwargs) self.loop.run_until_complete( @@ -128,6 +128,10 @@ def get(self, url, **kwargs): ) except asyncio.TimeoutError: print("Timeout loading %s" % url) + except RuntimeError: + self.loop.run_until_complete( + self.page.get(url, **kwargs) + ) url_protocol = url.split(":")[0] safe_url = True if url_protocol not in ["about", "data", "chrome"]: @@ -1395,6 +1399,20 @@ def get_page_source(self, include_shadow_dom=True): ) return source + def get_beautiful_soup(self, source=None): + """BeautifulSoup is a toolkit for dissecting an HTML document + and extracting what you need. 
It's great for screen-scraping! + See: https://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ + from bs4 import BeautifulSoup + + if not source: + with suppress(Exception): + self.wait_for_element_visible( + "body", timeout=settings.MINI_TIMEOUT + ) + source = self.get_page_source() + return BeautifulSoup(source, "html.parser") + def get_user_agent(self): return self.loop.run_until_complete( self.page.evaluate("navigator.userAgent") diff --git a/seleniumbase/undetected/cdp_driver/cdp_util.py b/seleniumbase/undetected/cdp_driver/cdp_util.py index c38f963d8b7..b66e35143ae 100644 --- a/seleniumbase/undetected/cdp_driver/cdp_util.py +++ b/seleniumbase/undetected/cdp_driver/cdp_util.py @@ -31,6 +31,7 @@ PROXY_DIR_LOCK = proxy_helper.PROXY_DIR_LOCK EXTENSIONS_DIR = os.path.dirname(os.path.realpath(extensions.__file__)) AD_BLOCK_ZIP_PATH = os.path.join(EXTENSIONS_DIR, "ad_block.zip") +DISABLE_CSP_ZIP_PATH = os.path.join(EXTENSIONS_DIR, "disable_csp.zip") T = typing.TypeVar("T") @@ -598,16 +599,6 @@ async def start( proxy_pass, proxy_scheme, ) - if ad_block: - sb_config.ad_block_on = True - incognito = False - guest = False - ad_block_zip = AD_BLOCK_ZIP_PATH - ad_block_dir = os.path.join(DOWNLOADS_FOLDER, "ad_block") - __unzip_to_new_folder(ad_block_zip, ad_block_dir) - extension_dir = __add_chrome_ext_dir(extension_dir, ad_block_dir) - if disable_csp: - sb_config.disable_csp = True if "binary_location" in kwargs and not browser_executable_path: browser_executable_path = kwargs["binary_location"] if not user_data_dir and "--user-data-dir" in arg_join: @@ -624,8 +615,8 @@ async def start( user_data_dir = udd_string if user_data_dir: user_data_dir = os.path.abspath(user_data_dir) + browser = None if not browser_executable_path: - browser = None if "browser" in kwargs: browser = kwargs["browser"] if not browser and "--browser" in arg_join: @@ -675,6 +666,23 @@ async def start( sb_config._cdp_browser = "atlas" else: sb_config._cdp_browser = "chrome" + if ad_block: + 
sb_config.ad_block_on = True + incognito = False + guest = False + ad_block_zip = AD_BLOCK_ZIP_PATH + ad_block_dir = os.path.join(DOWNLOADS_FOLDER, "ad_block") + __unzip_to_new_folder(ad_block_zip, ad_block_dir) + extension_dir = __add_chrome_ext_dir(extension_dir, ad_block_dir) + if disable_csp: + sb_config.disable_csp = True + if not incognito and not guest: + disable_csp_zip = DISABLE_CSP_ZIP_PATH + disable_csp_dir = os.path.join(DOWNLOADS_FOLDER, "disable_csp") + __unzip_to_new_folder(disable_csp_zip, disable_csp_dir) + extension_dir = __add_chrome_ext_dir( + extension_dir, disable_csp_dir + ) sb_config.incognito = incognito sb_config.guest_mode = guest if not config: diff --git a/seleniumbase/undetected/cdp_driver/connection.py b/seleniumbase/undetected/cdp_driver/connection.py index 196b03e9aa8..920fab3982e 100644 --- a/seleniumbase/undetected/cdp_driver/connection.py +++ b/seleniumbase/undetected/cdp_driver/connection.py @@ -305,20 +305,31 @@ async def wait(self, t: Union[int, float] = None): await self.update_target() loop = asyncio.get_running_loop() start_time = loop.time() - try: - if isinstance(t, (int, float)): - await asyncio.wait_for(self.listener.idle.wait(), timeout=t) - while (loop.time() - start_time) < t: - await asyncio.sleep(0.1) - else: - await self.listener.idle.wait() - except asyncio.TimeoutError: - if isinstance(t, (int, float)): - # Explicit time is given, which is now passed, so leave now. - return - except AttributeError: - # No listener created yet. 
- pass + with warnings.catch_warnings(): + warnings.filterwarnings( + action="ignore", + category=RuntimeWarning, + message=".*coroutine.*", + ) + try: + if isinstance(t, (int, float)): + try: + await asyncio.wait_for( + self.listener.idle.wait(), timeout=t + ) + except RuntimeError: + await self.listener.idle.wait() + while (loop.time() - start_time) < t: + await asyncio.sleep(0.1) + else: + await self.listener.idle.wait() + except asyncio.TimeoutError: + if isinstance(t, (int, float)): + # Explicit time that's given has passed, so leave now. + return + except AttributeError: + # No listener created yet. + pass async def set_locale(self, locale: Optional[str] = None): """Sets the Language Locale code via set_user_agent_override.""" @@ -423,9 +434,9 @@ async def send( await self.websocket.send(tx.message) with warnings.catch_warnings(): warnings.filterwarnings( - "ignore", - message="coroutine .* was never awaited", + action="ignore", category=RuntimeWarning, + message=".*coroutine.*", ) try: return await tx diff --git a/setup.py b/setup.py index 52d6f309456..690a025a9f2 100755 --- a/setup.py +++ b/setup.py @@ -180,7 +180,8 @@ 'charset-normalizer>=3.4.6,<4', 'urllib3>=1.26.20,<2;python_version<"3.10"', 'urllib3>=1.26.20,<3;python_version>="3.10"', - 'requests~=2.32.5', + 'requests~=2.32.5;python_version<"3.10"', + 'requests~=2.33.0;python_version>="3.10"', 'sniffio==1.3.1', 'h11==0.16.0', 'outcome==1.3.0.post0', @@ -261,7 +262,7 @@ "pdfminer": [ 'pdfminer.six==20251107;python_version<"3.10"', 'pdfminer.six==20260107;python_version>="3.10"', - 'cryptography==46.0.5', + 'cryptography==46.0.6', 'cffi==2.0.0', 'pycparser==2.23;python_version<"3.10"', 'pycparser==3.0;python_version>="3.10"',