WebHost: Add robots.txt to WebHost (#3157)

* Add a `robots.txt` file to prevent crawlers from scraping the site

* Added `ASSET_RIGHTS` entry to config.yaml to control whether `/robots.txt` is served or not

* Always import robots.py, determine config in route function

* Finish writing a comment

* Remove redundant import and config
Chris Wilson, 2024-04-20 20:58:56 -04:00 (committed by GitHub)
parent 915ad61ecf, commit ad4451276d
5 changed files with 44 additions and 5 deletions
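The commit message mentions `config.yaml`, but the loader itself is not part of this diff. The snippet below is only a hedged sketch (assuming PyYAML and a top-level `config.yaml`, names hypothetical) of how an operator's override of the `ASSET_RIGHTS` default set in `WebHostLib/__init__.py` might be applied:

```python
# Hypothetical sketch, not code from this commit: apply config.yaml overrides
# (e.g. "ASSET_RIGHTS: true" on the official host) on top of the app defaults.
import os
import yaml  # assumes PyYAML is available

from WebHostLib import app

config_path = os.path.abspath("config.yaml")
if os.path.exists(config_path):
    with open(config_path) as f:
        overrides = yaml.safe_load(f) or {}
    # True marks this deployment as the official WebHost: /robots.txt then 404s
    # and crawlers fall back to their default (crawl-allowed) behaviour.
    app.config.update(overrides)
```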

WebHostLib/__init__.py

@@ -51,6 +51,7 @@ app.config["PONY"] = {
 app.config["MAX_ROLL"] = 20
 app.config["CACHE_TYPE"] = "SimpleCache"
 app.config["HOST_ADDRESS"] = ""
+app.config["ASSET_RIGHTS"] = False
 cache = Cache()
 Compress(app)
@@ -82,6 +83,6 @@ def register():
     from WebHostLib.customserver import run_server_process
     # to trigger app routing picking up on it
-    from . import tracker, upload, landing, check, generate, downloads, api, stats, misc
+    from . import tracker, upload, landing, check, generate, downloads, api, stats, misc, robots
     app.register_blueprint(api.api_endpoints)

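The second hunk, together with the "to trigger app routing picking up on it" comment, relies on Flask only learning about a view once the module defining it has been imported. A small self-contained illustration of that behaviour (generic Flask, not Archipelago code):

```python
from flask import Flask

app = Flask(__name__)

def has_robots_rule() -> bool:
    # Inspect the URL map that @app.route() decorators populate.
    return any(rule.rule == "/robots.txt" for rule in app.url_map.iter_rules())

print(has_robots_rule())  # False: no module defining the view has run yet

# In WebHost this happens via "from . import ..., robots"; here the view is
# defined inline to keep the example self-contained.
@app.route("/robots.txt")
def robots():
    return "User-agent: *\nDisallow: /\n", 200, {"Content-Type": "text/plain"}

print(has_robots_rule())  # True: executing the decorator registered the rule
```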
WebHostLib/robots.py (new file)

@@ -0,0 +1,14 @@
+from WebHostLib import app
+from flask import abort
+from . import cache
+
+
+@cache.cached()
+@app.route('/robots.txt')
+def robots():
+    # If this host is not official, do not allow search engine crawling
+    if not app.config["ASSET_RIGHTS"]:
+        return app.send_static_file('robots.txt')
+
+    # Send 404 if the host has affirmed this to be the official WebHost
+    abort(404)

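A minimal sketch of how the gating above could be exercised with Flask's built-in test client. This is not a test from this commit, and it assumes `WebHostLib` imports cleanly in a test environment:

```python
# Hypothetical check, not part of this commit: exercise robots() under both
# ASSET_RIGHTS settings using Flask's test client.
from WebHostLib import app
from WebHostLib import robots  # noqa: F401  (ensures the route is registered)

app.config["ASSET_RIGHTS"] = False  # unofficial host: serve the disallow file
with app.test_client() as client:
    response = client.get("/robots.txt")
    assert response.status_code == 200
    assert b"Disallow: /" in response.data

app.config["ASSET_RIGHTS"] = True  # official host: crawlers get a 404 instead
with app.test_client() as client:
    assert client.get("/robots.txt").status_code == 404
```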
WebHostLib/static/robots.txt (new file)

@@ -0,0 +1,20 @@
+User-agent: Googlebot
+Disallow: /
+
+User-agent: APIs-Google
+Disallow: /
+
+User-agent: AdsBot-Google-Mobile
+Disallow: /
+
+User-agent: AdsBot-Google
+Disallow: /
+
+User-agent: Mediapartners-Google
+Disallow: /
+
+User-agent: Google-Safety
+Disallow: /
+
+User-agent: *
+Disallow: /
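As a sanity check of the policy above, Python's standard-library `urllib.robotparser` can confirm that both a named crawler and any other agent (via the wildcard stanza) are denied. This is only an illustration, not part of the commit:

```python
from urllib.robotparser import RobotFileParser

# A trimmed copy of the rules the static file serves: named crawlers and the
# wildcard agent are disallowed from the entire site.
robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: *
Disallow: /
"""

parser = RobotFileParser()
parser.parse(robots_txt.splitlines())

print(parser.can_fetch("Googlebot", "https://example.org/"))     # False
print(parser.can_fetch("SomeOtherBot", "https://example.org/"))  # False (wildcard)
```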