summaryrefslogtreecommitdiff
path: root/forge/script/gen-robots-sitemap.sh
blob: 4443faf68e64a939a484f5e2ecd19d72873857d2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env bash
#
# Generate robots.txt and sitemap.xml inside a built static-site directory.
#
# Usage: gen-robots-sitemap.sh [OUT_DIR [SITE_URL]]
#   OUT_DIR   directory to scan and write into (default: .faircamp_build)
#   SITE_URL  public base URL; one trailing slash is stripped
#             (default: https://st33v.com)
#
# Outputs: OUT_DIR/robots.txt and OUT_DIR/sitemap.xml (both overwritten).
set -euo pipefail

OUT_DIR="${1:-.faircamp_build}"
SITE_URL="${2:-https://st33v.com}"
base_url="${SITE_URL%/}"

#######################################
# Percent-encode a relative file path for use inside a URL.
# Bytes outside the RFC 3986 path character set are written as %XX;
# "/" is kept as the segment separator.
# Arguments: $1 - path relative to the site root
# Outputs:   encoded path on stdout (no trailing newline)
#######################################
url_encode_path() {
  # Work byte-wise so a multi-byte UTF-8 character becomes one %XX per byte.
  local LC_ALL=C
  # Unreserved characters, sub-delims, ":" and "@", plus the "/" separator.
  local -r keep="-._~/!\$&'()*+,;=:@"
  local raw=$1 encoded='' ch hex i
  for (( i = 0; i < ${#raw}; i++ )); do
    ch=${raw:i:1}
    if [[ $ch == [a-zA-Z0-9] || $keep == *"$ch"* ]]; then
      encoded+=$ch
    else
      printf -v hex '%%%02X' "'$ch"
      encoded+=$hex
    fi
  done
  printf '%s' "$encoded"
}

#######################################
# Replace the five XML special characters with their entities.
# Arguments: $1 - text to escape
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
xml_escape() {
  local text=$1
  # "&" goes first so the entities added below are not escaped again.
  # The replacement "&" is backslash-escaped so it stays literal even when
  # bash treats an unquoted "&" in the replacement as "the matched text".
  text=${text//&/\&amp;}
  text=${text//</\&lt;}
  text=${text//>/\&gt;}
  text=${text//\"/\&quot;}
  text=${text//\'/\&apos;}
  printf '%s' "$text"
}

cd -- "$OUT_DIR"

# --- robots.txt ---
cat > robots.txt <<EOF
User-agent: *
Allow: /

Sitemap: ${base_url}/sitemap.xml
EOF

# --- sitemap.xml ---
# Include HTML pages + common content types; exclude obvious junk.
# The document is assembled in a temp file and copied over sitemap.xml only
# after every step succeeded, so a failing find/date does not leave a
# truncated sitemap behind. The trap removes the temp file on every exit path.
tmp="$(mktemp)"
trap 'rm -f -- "$tmp"' EXIT

{
  printf '%s\n' '<?xml version="1.0" encoding="UTF-8"?>'
  printf '%s\n' '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
  # Paths stay NUL-delimited all the way into the loop, so names containing
  # tabs or newlines cannot split or merge entries. LC_ALL=C makes the order
  # independent of the caller's locale.
  find . -type f \( -name '*.html' -o -name '*.pdf' \
    -o -name '*.mp3' -o -name '*.flac' -o -name '*.opus' \) \
    ! -path './.git/*' ! -path './assets/*' ! -path './static/*' \
    -print0 \
    | LC_ALL=C sort -z \
    | while IFS= read -r -d '' f; do
        # Turn ./path/index.html into path/index.html
        rel="${f#./}"
        # lastmod (UTC) taken from the file's modification time
        lastmod="$(date -u -r "$f" +%Y-%m-%dT%H:%M:%SZ)"
        # URL-escape the path first, then entity-escape the complete URL.
        loc="$(xml_escape "${base_url}/$(url_encode_path "$rel")")"
        printf '  <url><loc>%s</loc><lastmod>%s</lastmod></url>\n' \
          "$loc" "$lastmod"
      done
  printf '%s\n' '</urlset>'
} > "$tmp"

# Copy (rather than move) so sitemap.xml gets normal umask permissions
# instead of the restrictive mode mktemp gives its file.
cat -- "$tmp" > sitemap.xml

printf 'Wrote: %s\n' "$OUT_DIR/robots.txt"
printf 'Wrote: %s\n' "$OUT_DIR/sitemap.xml"