
Commit 310fe88

gh-79638: Treat an unreachable robots.txt as "disallow all" (GH-138555)
Disallow all access in urllib.robotparser if the robots.txt file is unreachable due to server or network errors.
1 parent fbba343 commit 310fe88

3 files changed

Lines changed: 61 additions & 17 deletions
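
What the change means in practice: a robots.txt that answers with a server error (5xx) now disallows the entire site, while most other 4xx responses still allow everything. A minimal standalone sketch of the new behavior (the handler name, host, and URLs below are illustrative, not taken from the commit):

import threading
import urllib.robotparser
from http.server import BaseHTTPRequestHandler, HTTPServer

class FailingHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_error(503)  # a 5xx on robots.txt now means "disallow all"

    def log_message(self, fmt, *args):
        pass  # keep the demo quiet

server = HTTPServer(("127.0.0.1", 0), FailingHandler)
threading.Thread(target=server.serve_forever, daemon=True).start()

parser = urllib.robotparser.RobotFileParser()
parser.set_url(f"http://127.0.0.1:{server.server_address[1]}/robots.txt")
parser.read()
print(parser.can_fetch("*", "/any/page.html"))  # False with this commit
server.shutdown()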

Lib/test/test_robotparser.py
Lines changed: 50 additions & 16 deletions

@@ -646,26 +646,23 @@ def test_group_without_user_agent(self):
         )

 class BaseLocalNetworkTestCase:

-    def setUp(self):
+    @classmethod
+    def setUpClass(cls):
         # clear _opener global variable
-        self.addCleanup(urllib.request.urlcleanup)
+        cls.addClassCleanup(urllib.request.urlcleanup)

-        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
+        cls.server = HTTPServer((socket_helper.HOST, 0), cls.RobotHandler)
+        cls.addClassCleanup(cls.server.server_close)

-        self.t = threading.Thread(
+        t = threading.Thread(
             name='HTTPServer serving',
-            target=self.server.serve_forever,
+            target=cls.server.serve_forever,
             # Short poll interval to make the test finish quickly.
             # Time between requests is short enough that we won't wake
             # up spuriously too many times.
             kwargs={'poll_interval':0.01})
-        self.t.daemon = True  # In case this function raises.
-        self.t.start()
-
-    def tearDown(self):
-        self.server.shutdown()
-        self.t.join()
-        self.server.server_close()
+        cls.enterClassContext(threading_helper.start_threads([t]))
+        cls.addClassCleanup(cls.server.shutdown)


 SAMPLE_ROBOTS_TXT = b'''\
@@ -687,7 +684,6 @@ def do_GET(self):
     def log_message(self, format, *args):
         pass

-    @threading_helper.reap_threads
     def testRead(self):
         # Test that reading a weird robots.txt doesn't fail.
         addr = self.server.server_address
@@ -709,24 +705,62 @@ def testRead(self):
         self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))


-class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+class HttpErrorsTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
     class RobotHandler(BaseHTTPRequestHandler):

         def do_GET(self):
-            self.send_error(403, "Forbidden access")
+            self.send_error(self.server.return_code)

         def log_message(self, format, *args):
             pass

-    @threading_helper.reap_threads
+    def setUp(self):
+        # Make sure that a valid code is set in the test.
+        self.server.return_code = None
+
     def testPasswordProtectedSite(self):
+        self.server.return_code = 403
         addr = self.server.server_address
         url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
         robots_url = url + "/robots.txt"
         parser = urllib.robotparser.RobotFileParser()
         parser.set_url(url)
         parser.read()
         self.assertFalse(parser.can_fetch("*", robots_url))
+        self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
+
+    def testNotFound(self):
+        self.server.return_code = 404
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertTrue(parser.can_fetch("*", robots_url))
+        self.assertTrue(parser.can_fetch("*", url + '/path/file.html'))
+
+    def testTeapot(self):
+        self.server.return_code = 418
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertTrue(parser.can_fetch("*", robots_url))
+        self.assertTrue(parser.can_fetch("*", url + '/pot-1?milk-type=Cream'))
+
+    def testServiceUnavailable(self):
+        self.server.return_code = 503
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertFalse(parser.can_fetch("*", robots_url))
+        self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))


 @support.requires_working_socket()
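
The refactor above replaces the per-test setUp/tearDown pair with a class-scoped server, so all four HTTP-error tests share one HTTPServer and only the return code varies. Outside CPython's internal test.support helpers, the same pattern looks roughly like this (a distilled sketch; the handler and test names are made up):

import threading
import unittest
import urllib.robotparser
from http.server import BaseHTTPRequestHandler, HTTPServer

class QuietHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_error(self.server.return_code)

    def log_message(self, fmt, *args):
        pass

class ServerTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # One server per class; class cleanups run in LIFO order, so the
        # socket is closed only after the serving thread has stopped.
        cls.server = HTTPServer(("127.0.0.1", 0), QuietHandler)
        cls.addClassCleanup(cls.server.server_close)
        t = threading.Thread(target=cls.server.serve_forever,
                             kwargs={"poll_interval": 0.01}, daemon=True)
        t.start()
        cls.addClassCleanup(t.join)
        cls.addClassCleanup(cls.server.shutdown)

    def test_service_unavailable(self):
        self.server.return_code = 503
        host, port = self.server.server_address
        parser = urllib.robotparser.RobotFileParser(f"http://{host}:{port}/robots.txt")
        parser.read()
        self.assertFalse(parser.can_fetch("*", f"http://{host}:{port}/x"))

if __name__ == "__main__":
    unittest.main()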

Lib/urllib/robotparser.py
Lines changed: 9 additions & 1 deletion

@@ -65,9 +65,17 @@ def read(self):
             f = urllib.request.urlopen(self.url)
         except urllib.error.HTTPError as err:
             if err.code in (401, 403):
+                # If access to robots.txt has the status Unauthorized/Forbidden,
+                # then most likely this applies to the entire site.
                 self.disallow_all = True
-            elif err.code >= 400 and err.code < 500:
+            elif 400 <= err.code < 500:
+                # RFC 9309, Section 2.3.1.3: the crawler MAY access any
+                # resources on the server.
                 self.allow_all = True
+            elif 500 <= err.code < 600:
+                # RFC 9309, Section 2.3.1.4: the crawler MUST assume
+                # complete disallow.
+                self.disallow_all = True
             err.close()
         else:
             raw = f.read()
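
The resulting mapping (401/403 → disallow all; other 4xx → allow all; 5xx → disallow all, new in this commit) can be verified without any network traffic by stubbing urllib.request.urlopen to raise an HTTPError. A sketch under that assumption; the example.com URL is a placeholder and is never contacted:

import io
import urllib.error
import urllib.request
import urllib.robotparser

def flags_for(code):
    # Replace urlopen so that read() sees an HTTPError with the given code.
    def fake_urlopen(url):
        raise urllib.error.HTTPError(url, code, "stub", hdrs=None, fp=io.BytesIO())
    real_urlopen = urllib.request.urlopen
    urllib.request.urlopen = fake_urlopen
    try:
        parser = urllib.robotparser.RobotFileParser("http://example.com/robots.txt")
        parser.read()
        return parser.allow_all, parser.disallow_all
    finally:
        urllib.request.urlopen = real_urlopen

print(flags_for(403))  # (False, True): Unauthorized/Forbidden -> disallow all
print(flags_for(404))  # (True, False): other 4xx -> allow all (RFC 9309, 2.3.1.3)
print(flags_for(503))  # (False, True): 5xx -> disallow all (RFC 9309, 2.3.1.4)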
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+Disallow all access in :mod:`urllib.robotparser` if the ``robots.txt`` file
+is unreachable due to server or network errors.
