diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9939bec02..082d9b69b 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -21,10 +21,16 @@ jobs:
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+      # fetch-depth: 0 is required so Docusaurus can read git log per file to
+      # emit accurate dates in sitemap.xml. Without it every file gets
+      # the same date (the latest commit) and the Google Indexing diff submits
+      # every URL on every deploy, burning the full daily quota each time.
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - name: Set Node.js 22.x
-        uses: actions/setup-node@v3
+        uses: actions/setup-node@v4
         with:
           node-version: 22.x
 
@@ -153,3 +159,74 @@ jobs:
           echo "  curl -sS -X POST \"https://api.indexnow.org/indexnow\" -H 'Content-Type: application/json' \\"
           echo "    -d '{\"host\":\"${INDEXNOW_HOST}\",\"key\":\"${INDEXNOW_KEY}\",\"keyLocation\":\"${INDEXNOW_KEY_LOC}\",\"urlList\":[\"https://${INDEXNOW_HOST}/docs/\"]}'"
           exit 1
+
+      # Restore the sitemap from the previous successful deploy so the Google
+      # Indexing step can diff and only submit URLs that are new since last run.
+      # key includes github.sha so a fresh entry is saved after every deploy;
+      # restore-keys provides a fallback to the most-recent previous run.
+      - name: Restore previous sitemap cache
+        uses: actions/cache@v4
+        with:
+          path: .sitemap-prev.xml
+          key: sitemap-prev-${{ github.sha }}
+          restore-keys: |
+            sitemap-prev-
+
+      # Submit new/changed URLs to Google Indexing API and ping the GSC sitemap
+      # endpoint. Uses a cached copy of the previous sitemap (restored above) to
+      # diff so we only burn quota on pages that actually changed.
+      #
+      # Required secret: GOOGLE_SERVICE_ACCOUNT_JSON
+      #   — paste the full JSON key of a Google Cloud service account that has
+      #     been granted "Owner" access in GSC (Search Console → Settings →
+      #     Users and permissions → Add user).
+      #   — The service account also needs the "Indexing API" enabled on its
+      #     Cloud project (APIs & Services → Enable APIs → Web Search Indexing API).
+      #
+      # Optional secret: GSC_SITE_URL (default: https://keploy.io/)
+      #   — must match the property URL exactly as it appears in GSC.
+      - name: Submit changed URLs to Google Indexing API
+        env:
+          GOOGLE_SERVICE_ACCOUNT_JSON: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_JSON }}
+          GSC_SITE_URL: ${{ secrets.GSC_SITE_URL || 'https://keploy.io/' }}
+        continue-on-error: true
+        run: |
+          set -euo pipefail
+
+          SITEMAP="build/docs/sitemap.xml"
+          if [ ! -f "$SITEMAP" ]; then
+            echo "::notice::Sitemap not found at $SITEMAP, skipping Google Indexing submission"
+            exit 0
+          fi
+
+          if [ -z "${GOOGLE_SERVICE_ACCOUNT_JSON:-}" ]; then
+            echo "::notice::GOOGLE_SERVICE_ACCOUNT_JSON secret not set, skipping Google Indexing submission"
+            exit 0
+          fi
+
+          # Install only the auth library — pinned to major version so breaking
+          # changes in a future release don't silently break this step.
+          # Captured with || so a registry/network failure doesn't abort under
+          # set -e before cp runs, which would leave the cache baseline stale.
+          SCRIPT_EXIT=0
+          npm install --no-save google-auth-library@10 || SCRIPT_EXIT=$?
+
+          if [ "$SCRIPT_EXIT" -eq 0 ]; then
+            node scripts/google-index.js \
+              --sitemap "$SITEMAP" \
+              --prev-sitemap .sitemap-prev.xml \
+              --sitemap-url "https://keploy.io/docs/sitemap.xml" \
+              --site-url "${GSC_SITE_URL}" || SCRIPT_EXIT=$?
+          else
+            echo "::error::Failed to install google-auth-library@10 — retry the workflow or check npm registry connectivity."
+          fi
+
+          # Only advance the cached baseline when ALL submissions completed —
+          # no failures, no quota truncation. If anything was skipped, the
+          # script exits non-zero and we keep the old baseline so the next
+          # deploy re-diffs from the same point and picks up the missed URLs.
+          if [ "$SCRIPT_EXIT" -eq 0 ]; then
+            cp "$SITEMAP" .sitemap-prev.xml
+          fi
+
+          exit $SCRIPT_EXIT
diff --git a/docusaurus.config.js b/docusaurus.config.js
index 1efe37351..8ef5ee2c6 100644
--- a/docusaurus.config.js
+++ b/docusaurus.config.js
@@ -450,6 +450,10 @@ module.exports = {
         changefreq: "weekly",
         priority: 0.5,
         filename: "sitemap.xml",
+        // Emit <lastmod> using the git commit date of each file so the
+        // Google Indexing API step can diff by date and only resubmit
+        // pages whose content actually changed since the last deploy.
+        lastmod: "date",
         // Differentiate docs sitemap priorities by content type so
         // search engines spend crawl budget proportional to how
         // canonical each page is. Priority buckets:
diff --git a/scripts/google-index.js b/scripts/google-index.js
new file mode 100644
index 000000000..bc71f7509
--- /dev/null
+++ b/scripts/google-index.js
@@ -0,0 +1,314 @@
+#!/usr/bin/env node
+// Submits new/changed docs URLs to the Google Indexing API and pings the GSC sitemap endpoint.
+// Reads GOOGLE_SERVICE_ACCOUNT_JSON from env; uses a previous-sitemap file for smart diffing
+// so only URLs that are new or have a changed date consume quota.
+//
+// Usage:
+//   node scripts/google-index.js \
+//     --sitemap build/docs/sitemap.xml \
+//     --prev-sitemap .sitemap-prev.xml \
+//     --sitemap-url https://keploy.io/docs/sitemap.xml
+// Add --all to force-submit every URL (ignores prev-sitemap).
+
+'use strict';
+
+const { GoogleAuth } = require('google-auth-library');
+const fs = require('fs');
+
+const INDEXING_ENDPOINT =
+  'https://indexing.googleapis.com/v3/urlNotifications:publish';
+const GSC_SITEMAPS_API =
+  'https://www.googleapis.com/webmasters/v3/sites';
+
+// Google's published quota: 200 URL_UPDATED notifications per day (default).
+// Burst: up to 10 per second before the per-second quota kicks in.
+const DAILY_QUOTA = 200;
+const BURST_SIZE = 10;
+const MAX_RETRIES = 3;
+
+// ── helpers ──────────────────────────────────────────────────────────────────
+
+function parseArgs() {
+  const argv = process.argv.slice(2);
+  const get = (flag) => {
+    const i = argv.indexOf(flag);
+    return i !== -1 ? argv[i + 1] : null;
+  };
+  return {
+    sitemap: get('--sitemap') || 'build/docs/sitemap.xml',
+    prevSitemap: get('--prev-sitemap') || '.sitemap-prev.xml',
+    sitemapUrl: get('--sitemap-url') || 'https://keploy.io/docs/sitemap.xml',
+    siteUrl: get('--site-url') || 'https://keploy.io/',
+    all: argv.includes('--all'),
+  };
+}
+
+// Returns Map<loc, lastmod> — lastmod is the raw string from <lastmod> or
+// null when the tag is absent. Used for both presence and date diffing.
+function parseSitemap(filepath) {
+  if (!fs.existsSync(filepath)) return new Map();
+  const content = fs.readFileSync(filepath, 'utf8');
+  const result = new Map();
+  // Match each <url> block so loc and lastmod stay paired.
+  const urlBlocks = content.match(/<url>[\s\S]*?<\/url>/g) || [];
+  for (const block of urlBlocks) {
+    const locMatch = block.match(/<loc>([^<]+)<\/loc>/);
+    const lastmodMatch = block.match(/<lastmod>([^<]+)<\/lastmod>/);
+    if (locMatch) {
+      result.set(locMatch[1].trim(), lastmodMatch ? lastmodMatch[1].trim() : null);
+    }
+  }
+  return result;
+}
+
+// Mirror the same filters used by the IndexNow step so both pipelines
+// submit identical URL sets: no /tags/ pages, no versioned /docs/N.M.P/ paths.
+function filterUrl(url) {
+  return !url.includes('/tags/') && !/\/docs\/\d+\.\d+\.\d+\//.test(url);
+}
+
+function sleep(ms) {
+  return new Promise((r) => setTimeout(r, ms));
+}
+
+// ── Google Indexing API ──────────────────────────────────────────────────────
+
+// Retries on 429, 5xx, and network errors with exponential backoff.
+// Hard 4xx (e.g. 404, 403) are permanent failures — no retry.
+async function submitOne(token, url, type) {
+  let delay = 1000;
+  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
+    try {
+      const res = await fetch(INDEXING_ENDPOINT, {
+        method: 'POST',
+        headers: {
+          Authorization: `Bearer ${token}`,
+          'Content-Type': 'application/json',
+        },
+        body: JSON.stringify({ url, type }),
+      });
+
+      if (res.ok) return { ok: true };
+
+      const body = await res.text().catch(() => '');
+      const retryable = res.status === 429 || res.status >= 500;
+
+      if (!retryable || attempt === MAX_RETRIES) {
+        return { ok: false, status: res.status, body: body.slice(0, 200) };
+      }
+
+      console.log(`  retry ${attempt}/${MAX_RETRIES} after ${delay}ms (HTTP ${res.status})`);
+    } catch (err) {
+      // Network-level error (DNS, connection reset, timeout).
+      if (attempt === MAX_RETRIES) {
+        return { ok: false, status: 0, body: err.message };
+      }
+      console.log(`  retry ${attempt}/${MAX_RETRIES} after ${delay}ms (${err.message})`);
+    }
+
+    await sleep(delay);
+    delay *= 2;
+  }
+  // Guard: unreachable with MAX_RETRIES > 0, but prevents implicit undefined return.
+  return { ok: false, status: 0, body: 'max retries exceeded' };
+}
+
+async function submitBatch(token, urls, type) {
+  let ok = 0;
+  let fail = 0;
+
+  for (let i = 0; i < urls.length; i++) {
+    const url = urls[i];
+    const result = await submitOne(token, url, type);
+    if (result.ok) {
+      ok++;
+      if (i < 5 || i % 10 === 0 || i === urls.length - 1) {
+        console.log(`  [${i + 1}/${urls.length}] ✓ ${url}`);
+      }
+    } else {
+      console.log(`  [${i + 1}/${urls.length}] ✗ HTTP ${result.status} — ${url}`);
+      console.log(`    Response: ${result.body}`);
+      fail++;
+    }
+
+    // Stay under burst limit: 10 req/s.
+    if ((i + 1) % BURST_SIZE === 0 && i + 1 < urls.length) {
+      await sleep(1100);
+    }
+  }
+
+  return { ok, fail };
+}
+
+async function submitUrls(token, updatedUrls, deletedUrls) {
+  let totalFail = 0;
+
+  // ── URL_UPDATED ────────────────────────────────────────────────────────────
+  const toUpdate = updatedUrls.slice(0, DAILY_QUOTA);
+  const skippedUpdates = updatedUrls.length - toUpdate.length;
+
+  if (skippedUpdates > 0) {
+    // Count skipped URLs as failures so the script exits non-zero and the
+    // workflow does NOT advance the cached sitemap baseline. The next deploy
+    // will re-diff from the same old baseline and pick up all skipped URLs.
+    totalFail += skippedUpdates;
+    console.log(
+      `::warning::${updatedUrls.length} URLs to update but daily quota is ${DAILY_QUOTA}; ` +
+        `${skippedUpdates} URL(s) will be retried on the next deploy. ` +
+        `Request a quota increase at console.cloud.google.com.`
+    );
+  }
+
+  if (toUpdate.length > 0) {
+    console.log(`\nSubmitting ${toUpdate.length} URL_UPDATED notification(s)…`);
+    const { ok, fail } = await submitBatch(token, toUpdate, 'URL_UPDATED');
+    console.log(`URL_UPDATED: ${ok} accepted, ${fail} failed.`);
+    totalFail += fail;
+  } else {
+    console.log('No new/changed URLs to submit (URL_UPDATED).');
+  }
+
+  // ── URL_DELETED ────────────────────────────────────────────────────────────
+  // Quota for deletions shares the same 200/day pool — only send if there's
+  // remaining budget after updates.
+  const deletionBudget = Math.max(0, DAILY_QUOTA - toUpdate.length);
+  const toDelete = deletedUrls.slice(0, deletionBudget);
+  const skippedDeletions = deletedUrls.length - toDelete.length;
+
+  if (toDelete.length > 0) {
+    console.log(`\nSubmitting ${toDelete.length} URL_DELETED notification(s)…`);
+    const { ok, fail } = await submitBatch(token, toDelete, 'URL_DELETED');
+    console.log(`URL_DELETED: ${ok} accepted, ${fail} failed.`);
+    totalFail += fail;
+  }
+
+  if (skippedDeletions > 0) {
+    // Same as above — count as failures so the baseline is not advanced and
+    // the deleted URLs remain in the cached prev sitemap for the next diff.
+    totalFail += skippedDeletions;
+    console.log(
+      `::warning::${skippedDeletions} deleted URL(s) skipped — quota exhausted. ` +
+        `They will be signalled on the next deploy.`
+    );
+  }
+
+  return totalFail;
+}
+
+// ── GSC Sitemap ping ─────────────────────────────────────────────────────────
+
+async function pingSitemap(token, siteUrl, sitemapUrl) {
+  try {
+    const endpoint =
+      `${GSC_SITEMAPS_API}/${encodeURIComponent(siteUrl)}` +
+      `/sitemaps/${encodeURIComponent(sitemapUrl)}`;
+
+    const res = await fetch(endpoint, {
+      method: 'PUT',
+      headers: { Authorization: `Bearer ${token}` },
+    });
+
+    if (res.ok || res.status === 204) {
+      console.log(`\nGSC sitemap ping: OK (HTTP ${res.status}) — ${sitemapUrl}`);
+    } else {
+      const body = await res.text().catch(() => '');
+      console.log(
+        `\n::notice::GSC sitemap ping returned HTTP ${res.status}. Body: ${body.slice(0, 300)}`
+      );
+    }
+  } catch (err) {
+    console.log(`\n::notice::GSC sitemap ping failed: ${err.message}`);
+  }
+}
+
+// ── main ─────────────────────────────────────────────────────────────────────
+
+async function main() {
+  const args = parseArgs();
+
+  const saJson = process.env.GOOGLE_SERVICE_ACCOUNT_JSON;
+  if (!saJson) {
+    console.log(
+      '::error::GOOGLE_SERVICE_ACCOUNT_JSON is not set. ' +
+        'Add the service account key JSON as a GitHub secret.'
+    );
+    process.exit(1);
+  }
+
+  let credentials;
+  try {
+    credentials = JSON.parse(saJson);
+  } catch {
+    console.log('::error::GOOGLE_SERVICE_ACCOUNT_JSON is not valid JSON. Ensure the secret contains the raw service account JSON (not base64-encoded), with quotes and newlines preserved correctly.');
+    process.exit(1);
+  }
+
+  if (!fs.existsSync(args.sitemap)) {
+    console.log(`::notice::Sitemap not found at ${args.sitemap} — skipping.`);
+    process.exit(0);
+  }
+
+  // Fetch the token once — valid for 1 hour, well beyond the ~22s runtime
+  // for 200 URLs. No need to call getAccessToken() per request.
+  const auth = new GoogleAuth({
+    credentials,
+    scopes: [
+      'https://www.googleapis.com/auth/indexing',
+      'https://www.googleapis.com/auth/webmasters',
+    ],
+  });
+  const client = await auth.getClient();
+  const { token } = await client.getAccessToken();
+  if (!token) {
+    console.log('::error::Failed to obtain an access token. Check that the service account key is valid and the Indexing API is enabled on its Cloud project.');
+    process.exit(1);
+  }
+
+  const newMap = parseSitemap(args.sitemap);
+  const prevMap = args.all ? new Map() : parseSitemap(args.prevSitemap);
+
+  const hasPrev = !args.all && fs.existsSync(args.prevSitemap);
+  if (!hasPrev) {
+    console.log(
+      args.all
+        ? '--all flag set: submitting every URL in the sitemap.'
+        : 'No previous sitemap cached — submitting all current URLs (first run).'
+    );
+  } else {
+    console.log(`Prev sitemap: ${prevMap.size} URLs | New sitemap: ${newMap.size} URLs`);
+  }
+
+  // URLs to update: new URL OR same URL with a different/newer lastmod date.
+  const updatedUrls = [];
+  for (const [url, lastmod] of newMap) {
+    if (!filterUrl(url)) continue;
+    if (!prevMap.has(url)) {
+      updatedUrls.push(url); // new page
+    } else if (lastmod && prevMap.get(url) !== lastmod) {
+      updatedUrls.push(url); // existing page with updated content
+    }
+  }
+
+  // URLs to delete: present in previous sitemap but gone from the new one.
+  const deletedUrls = [];
+  for (const [url] of prevMap) {
+    if (!filterUrl(url)) continue;
+    if (!newMap.has(url)) {
+      deletedUrls.push(url);
+    }
+  }
+
+  console.log(
+    `Changed/new: ${updatedUrls.length} | Deleted: ${deletedUrls.length}`
+  );
+
+  const failures = await submitUrls(token, updatedUrls, deletedUrls);
+  await pingSitemap(token, args.siteUrl, args.sitemapUrl);
+
+  if (failures > 0) process.exit(1);
+}
+
+main().catch((err) => {
+  console.log(`::error::Unhandled error: ${err.message}`);
+  process.exit(1);
+});