<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>Crawling on emsenn.net</title>
    <link>https://emsenn.net/tags/crawling/</link>
    <description>Recent content in Crawling on emsenn.net</description>
    <generator>Hugo</generator>
    <language>en-us</language>
    <lastBuildDate>Tue, 03 Mar 2026 00:00:00 +0000</lastBuildDate>
    <atom:link href="https://emsenn.net/tags/crawling/index.xml" rel="self" type="application/rss+xml" />
    <item>
      <title>robots.txt</title>
      <link>https://emsenn.net/library/domains/engineering/domains/tech/domains/computing/domains/internet/robots-txt/</link>
      <pubDate>Tue, 03 Mar 2026 00:00:00 +0000</pubDate>
      <guid>https://emsenn.net/library/domains/engineering/domains/tech/domains/computing/domains/internet/robots-txt/</guid>
      <description>&lt;p&gt;&lt;code&gt;robots.txt&lt;/code&gt; is a plain-text file placed at the root of a website (e.g., &lt;code&gt;https://example.com/robots.txt&lt;/code&gt;) that tells web crawlers which parts of the site they may or may not access. It follows the Robots Exclusion Protocol, first proposed by Martijn Koster in 1994 and codified as an internet standard in RFC 9309 (2022).&lt;/p&gt;&#xA;&lt;h2 id=&#34;how-it-works&#34;&gt;How it works&lt;/h2&gt;&#xA;&lt;p&gt;A &lt;code&gt;robots.txt&lt;/code&gt; file consists of one or more records, each specifying a user-agent (the crawler&amp;rsquo;s identifier) and a set of &lt;code&gt;Allow&lt;/code&gt; and &lt;code&gt;Disallow&lt;/code&gt; directives. Crawlers are expected to fetch this file before crawling any other page and to respect its directives, though compliance is voluntary — &lt;code&gt;robots.txt&lt;/code&gt; is a convention, not an access control mechanism.&lt;/p&gt;</description>
    </item>
  </channel>
</rss>
