scrape github release page for changelog

This commit is contained in:
2026-03-13 12:44:06 +00:00
parent 654827d307
commit 0c3de60c20
+60 -17
View File
@@ -70,29 +70,72 @@ release_tag="${release_tag%\}}"
release_tag="${release_tag#\'}" release_tag="${release_tag#\'}"
release_tag="${release_tag%\'}" release_tag="${release_tag%\'}"
api_url="https://api.github.com/repos/${RELEASE_API_REPO}/releases/tags/${release_tag}" release_url="${LATEST_RELEASE_URL%/latest}/tag/${release_tag}"
release_html=$(curl -fsSL "$release_url" || true)
curl_headers=(
-H "Accept: application/vnd.github+json"
-H "X-GitHub-Api-Version: 2022-11-28"
)
if [ -n "${GITHUB_TOKEN:-}" ]; then
curl_headers+=( -H "Authorization: Bearer ${GITHUB_TOKEN}" )
fi
api_response=$(curl -sS -w '\n%{http_code}' "${curl_headers[@]}" "$api_url" || true)
api_body=$(printf '%s\n' "$api_response" | sed '$d')
api_code=$(printf '%s\n' "$api_response" | tail -n1)
release_notes="" release_notes=""
if [ "$api_code" = "200" ]; then if [ -n "$release_html" ]; then
release_notes=$(printf '%s' "$api_body" | "$PYTHON_BIN" -c 'import json,sys; d=json.load(sys.stdin); print((d.get("body") or "").strip())' || true) release_notes=$(printf '%s' "$release_html" | "$PYTHON_BIN" -c '
# Convert the release-notes section of a GitHub release page (HTML on stdin)
# into lightly formatted plain text (stdout).
#
# NOTE: this program is embedded in a single-quoted shell string, so it must
# never contain the single-quote character (comments included).
import re
import sys
from html.parser import HTMLParser

# Tags that start and end on their own output line.
_BLOCK_TAGS = frozenset(("p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "li"))


class Extractor(HTMLParser):
    """Collect the text of the first release-body container in a page.

    The container is the first <div> whose data-test-selector attribute is
    "body-content" and whose class attribute contains "markdown-body"
    (attribute order does not matter).  Nested <div> elements are tracked by
    depth, so the capture ends at the closing tag of the container itself
    rather than at the first closing </div> found inside it.

    Formatting rules: <br> becomes a newline, block tags are put on their own
    line, <pre> becomes a fenced code block and inline <code> is wrapped in
    backticks.  The collected chunks are available in ``out``.
    """

    def __init__(self):
        super().__init__()
        self.out = []          # collected text chunks, joined by the caller
        self.in_code = False   # True while inside a <pre> block
        self.depth = 0         # <div> nesting depth; 0 means outside the body
        self.done = False      # True once the body container has been closed

    @staticmethod
    def _is_body(attrs):
        """Return True when attrs describe the release-body container."""
        found = dict(attrs)
        # A valueless attribute is reported as None, hence the "or".
        css = found.get("class") or ""
        return found.get("data-test-selector") == "body-content" and "markdown-body" in css

    def _ensure_newline(self, when_empty):
        """Append a newline unless the output already ends with one.

        when_empty decides what happens while nothing has been collected yet:
        opening tags pass False (no leading newline), closing tags pass True.
        """
        if self.out:
            if not self.out[-1].endswith("\n"):
                self.out.append("\n")
        elif when_empty:
            self.out.append("\n")

    def handle_starttag(self, tag, attrs):
        if self.done:
            return
        if self.depth == 0:
            # Still searching for the container; its own tag emits nothing.
            if tag == "div" and self._is_body(attrs):
                self.depth = 1
            return
        if tag == "div":
            self.depth += 1
        if tag == "br":
            self.out.append("\n")
        elif tag in _BLOCK_TAGS:
            self._ensure_newline(False)
        elif tag == "pre":
            self._ensure_newline(False)
            self.out.append("```\n")
            self.in_code = True
        elif tag == "code" and not self.in_code:
            self.out.append("`")

    def handle_endtag(self, tag):
        if self.done or self.depth == 0:
            return
        if tag == "div":
            self.depth -= 1
            if self.depth == 0:
                # Closing tag of the container: stop capturing for good.
                self.done = True
                return
        if tag in _BLOCK_TAGS:
            self._ensure_newline(True)
        elif tag == "pre":
            self._ensure_newline(True)
            self.out.append("```\n")
            self.in_code = False
        elif tag == "code" and not self.in_code:
            self.out.append("`")

    def handle_data(self, data):
        if data and self.depth > 0 and not self.done:
            self.out.append(data)


def extract_release_notes(html: str) -> str:
    """Return the release notes found in html as plain text.

    Returns an empty string when the page has no release-body container.
    Runs of three or more newlines are collapsed to a single blank line and
    surrounding whitespace is stripped.
    """
    parser = Extractor()
    parser.feed(html)
    # Flush any text the parser is still buffering at end of input.
    parser.close()
    text = "".join(parser.out)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


# Under "python -c" the module name is __main__, so this runs in the script.
if __name__ == "__main__":
    print(extract_release_notes(sys.stdin.read()))
' || true)
else else
echo "warning: failed to fetch release notes from GitHub API (status=$api_code, url=$api_url)" echo "warning: failed to fetch release page ${release_url}"
fi fi
if [ -z "$release_notes" ]; then if [ -z "$release_notes" ]; then
release_notes="_No changelog found in upstream release notes. Check ${LATEST_RELEASE_URL%/latest}/tag/${release_tag}._" release_notes="_No changelog found on upstream release page. Check ${release_url}._"
fi fi
delimiter="CHANGELOG_$(date +%s%N)" delimiter="CHANGELOG_$(date +%s%N)"