From 75d0c43a823d128b7f8507d3d86957f77ae9a60d Mon Sep 17 00:00:00 2001 From: rainxchzed Date: Thu, 21 May 2026 13:56:53 +0500 Subject: [PATCH 1/3] =?UTF-8?q?feat:=20add=20pushedAt=20field=20(R5/R13)?= =?UTF-8?q?=20=E2=80=94=20pipe=20GitHub=20pushed=5Fat=20through=20all=20la?= =?UTF-8?q?yers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - V18 migration: add pushed_at_gh TIMESTAMPTZ column to repos - Expose pushedAt in RepoResponse (distinct from updatedAt/metadata change) - Persist in GitHubSearchClient ingest, upsertMetadataOnly, and Meili sync - Map in RepoRepository, SearchRepository, MeiliRepoHit, all route mappers - Add POST /internal/backfill-pushed-at to fill NULL rows on existing data Co-Authored-By: Oz --- .../rainxch/githubstore/db/DatabaseFactory.kt | 1 + .../githubstore/db/MeilisearchClient.kt | 2 + .../rainxch/githubstore/db/RepoRepository.kt | 1 + .../githubstore/db/SearchRepository.kt | 3 +- .../zed/rainxch/githubstore/db/Tables.kt | 3 ++ .../githubstore/ingest/GitHubSearchClient.kt | 7 +++ .../rainxch/githubstore/model/RepoResponse.kt | 4 ++ .../githubstore/routes/InternalRoutes.kt | 54 ++++++++++++++++++- .../rainxch/githubstore/routes/RepoRoutes.kt | 1 + .../githubstore/routes/SearchRoutes.kt | 1 + .../resources/db/migration/V18__pushed_at.sql | 4 ++ 11 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 src/main/resources/db/migration/V18__pushed_at.sql diff --git a/src/main/kotlin/zed/rainxch/githubstore/db/DatabaseFactory.kt b/src/main/kotlin/zed/rainxch/githubstore/db/DatabaseFactory.kt index 35e5b8f..8b93f1a 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/db/DatabaseFactory.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/db/DatabaseFactory.kt @@ -96,6 +96,7 @@ object DatabaseFactory { "V15__license_info.sql", "V16__oauth_ephemeral.sql", "V17__signing_fingerprint_host.sql", + "V18__pushed_at.sql", ) for (migration in migrations) { val rawSql = this::class.java.classLoader diff 
--git a/src/main/kotlin/zed/rainxch/githubstore/db/MeilisearchClient.kt b/src/main/kotlin/zed/rainxch/githubstore/db/MeilisearchClient.kt index a01b70d..a7de8a1 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/db/MeilisearchClient.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/db/MeilisearchClient.kt @@ -152,6 +152,8 @@ data class MeiliRepoHit( val has_installers_linux: Boolean = false, val trending_score: Double? = null, val popularity_score: Double? = null, + // R5/R13: last commit timestamp; piped from GitHub pushed_at. + val pushed_at: String? = null, // Must be populated on every addDocuments() call — Meili's POST /documents // *replaces* the doc, so omitting this field wipes the SignalAggregationWorker's // most recent score. Null here is "no signal yet," not "no longer ranked." diff --git a/src/main/kotlin/zed/rainxch/githubstore/db/RepoRepository.kt b/src/main/kotlin/zed/rainxch/githubstore/db/RepoRepository.kt index c816249..96b25f3 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/db/RepoRepository.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/db/RepoRepository.kt @@ -95,6 +95,7 @@ class RepoRepository { releasesUrl = "${this[Repos.htmlUrl]}/releases", updatedAt = this[Repos.updatedAtGh]?.toString(), createdAt = this[Repos.createdAtGh]?.toString(), + pushedAt = this[Repos.pushedAtGh]?.toString(), latestReleaseDate = releaseDateStr, latestReleaseTag = this[Repos.latestReleaseTag], releaseRecency = recencyDays, diff --git a/src/main/kotlin/zed/rainxch/githubstore/db/SearchRepository.kt b/src/main/kotlin/zed/rainxch/githubstore/db/SearchRepository.kt index 56e5802..d55a4a8 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/db/SearchRepository.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/db/SearchRepository.kt @@ -55,7 +55,7 @@ class SearchRepository { has_installers_android, has_installers_windows, has_installers_macos, has_installers_linux, trending_score, popularity_score, search_score, - updated_at_gh, created_at_gh + updated_at_gh, 
created_at_gh, pushed_at_gh FROM repos """.trimIndent() ) @@ -115,6 +115,7 @@ class SearchRepository { releasesUrl = "${rs.getString("html_url")}/releases", updatedAt = rs.getString("updated_at_gh"), createdAt = rs.getString("created_at_gh"), + pushedAt = rs.getString("pushed_at_gh"), latestReleaseDate = releaseDateStr, latestReleaseTag = rs.getString("latest_release_tag"), releaseRecency = recencyDays, diff --git a/src/main/kotlin/zed/rainxch/githubstore/db/Tables.kt b/src/main/kotlin/zed/rainxch/githubstore/db/Tables.kt index fad5c63..675690b 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/db/Tables.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/db/Tables.kt @@ -36,6 +36,9 @@ object Repos : Table("repos") { val searchScore = float("search_score").nullable() val createdAtGh = timestampWithTimeZone("created_at_gh").nullable() val updatedAtGh = timestampWithTimeZone("updated_at_gh").nullable() + // R5/R13: last default-branch commit (GitHub pushed_at), distinct from + // updatedAtGh (last metadata change). Used by client Heartbeat animation. 
+ val pushedAtGh = timestampWithTimeZone("pushed_at_gh").nullable() val indexedAt = timestampWithTimeZone("indexed_at") override val primaryKey = PrimaryKey(id) diff --git a/src/main/kotlin/zed/rainxch/githubstore/ingest/GitHubSearchClient.kt b/src/main/kotlin/zed/rainxch/githubstore/ingest/GitHubSearchClient.kt index 9c05764..61769cc 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/ingest/GitHubSearchClient.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/ingest/GitHubSearchClient.kt @@ -539,6 +539,9 @@ class GitHubSearchClient( it[hasInstallersLinux] = platforms["linux"] ?: false it[downloadCount] = r.downloadCount it[searchScore] = scoreToWrite + it[pushedAtGh] = repo.pushedAt?.let { + try { OffsetDateTime.parse(it) } catch (_: Exception) { null } + } it[indexedAt] = OffsetDateTime.now() } scoredByRepoId[repo.id] = scoreToWrite.toDouble() @@ -576,6 +579,7 @@ class GitHubSearchClient( has_installers_windows = r.platformFlags["windows"] ?: false, has_installers_macos = r.platformFlags["macos"] ?: false, has_installers_linux = r.platformFlags["linux"] ?: false, + pushed_at = r.repo.pushedAt, // Meili's POST /documents replaces the whole doc. Omitting this // would wipe the SignalAggregationWorker's most recent score // on every passthrough/refresh until the next hourly cycle. @@ -622,6 +626,7 @@ class GitHubSearchClient( releasesUrl = "${repo.htmlUrl}/releases", updatedAt = repo.updatedAt, createdAt = repo.createdAt, + pushedAt = repo.pushedAt, latestReleaseDate = releaseDateStr, latestReleaseTag = release.tagName, releaseRecency = recencyDays, @@ -682,6 +687,8 @@ data class GitHubRepo( val disabled: Boolean = false, @SerialName("updated_at") val updatedAt: String? = null, @SerialName("created_at") val createdAt: String? = null, + // R5/R13: last default-branch commit, distinct from updated_at (metadata change). + @SerialName("pushed_at") val pushedAt: String? 
= null, ) @Serializable diff --git a/src/main/kotlin/zed/rainxch/githubstore/model/RepoResponse.kt b/src/main/kotlin/zed/rainxch/githubstore/model/RepoResponse.kt index b1ac4d3..812e447 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/model/RepoResponse.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/model/RepoResponse.kt @@ -49,6 +49,10 @@ data class RepoResponse( val releasesUrl: String?, val updatedAt: String?, val createdAt: String?, + // R5/R13: last commit timestamp (GitHub pushed_at = default-branch HEAD). + // Distinct from updatedAt (last metadata change). Null for Meili-served + // search results until meili_sync.py backfills the field. + val pushedAt: String? = null, val latestReleaseDate: String? = null, val latestReleaseTag: String? = null, val releaseRecency: Int? = null, diff --git a/src/main/kotlin/zed/rainxch/githubstore/routes/InternalRoutes.kt b/src/main/kotlin/zed/rainxch/githubstore/routes/InternalRoutes.kt index bc1d680..8df49b0 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/routes/InternalRoutes.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/routes/InternalRoutes.kt @@ -16,6 +16,7 @@ import kotlinx.coroutines.launch import kotlinx.serialization.Serializable import org.jetbrains.exposed.sql.SqlExpressionBuilder.eq import org.jetbrains.exposed.sql.SqlExpressionBuilder.isNull +import java.time.OffsetDateTime import org.jetbrains.exposed.sql.selectAll import org.jetbrains.exposed.sql.transactions.TransactionManager import org.jetbrains.exposed.sql.transactions.experimental.newSuspendedTransaction @@ -160,6 +161,54 @@ fun Route.internalRoutes( ) } + // One-shot backfill for pushed_at_gh (V18). Iterates every repo + // where pushed_at_gh IS NULL, re-fetches from GitHub, and writes + // the field. Self-terminates once all rows are filled. Shares the + // same backfillRunning gate as /backfill-stale so the two never + // run concurrently and don't race the rotation pool. 
+        post("/backfill-pushed-at") {
+            if (!authorized(call, adminToken)) {
+                return@post respondNotFound(call)
+            }
+            val limit = call.request.queryParameters["limit"]?.toIntOrNull()?.coerceIn(1, 10_000) ?: 10_000
+            if (!backfillRunning.compareAndSet(false, true)) {
+                call.response.header(HttpHeaders.RetryAfter, "60")
+                return@post call.respond(
+                    HttpStatusCode.Conflict,
+                    BackfillResponse(scheduled = 0, started = false, message = "backfill_already_running"),
+                )
+            }
+            // The gate is held from here on. If the candidate query throws, release it
+            // before propagating; otherwise later backfill requests get 409 until restart.
+            val candidates = try {
+                transaction {
+                    Repos.selectAll()
+                        .where { Repos.pushedAtGh.isNull() }
+                        .orderBy(Repos.id).limit(limit)
+                        .map { it[Repos.id] to it[Repos.fullName] }
+                }
+            } catch (t: Throwable) {
+                backfillRunning.set(false)
+                throw t
+            }
+            if (candidates.isEmpty()) {
+                backfillRunning.set(false)
+                return@post call.respond(
+                    HttpStatusCode.OK,
+                    BackfillResponse(scheduled = 0, started = false, message = "no rows missing pushed_at"),
+                )
+            }
+            backfillScope.launch {
+                try {
+                    runBackfill(searchClient, candidates)
+                } finally {
+                    backfillRunning.set(false)
+                }
+            }
+            call.response.header(HttpHeaders.CacheControl, "no-store")
+            call.respond(HttpStatusCode.Accepted, BackfillResponse(scheduled = candidates.size, started = true))
+        }
+
         // Browser dashboard. Basic Auth required in prod so the browser prompts
         // for credentials on first visit; optional in dev for local inspection.
authenticate(ADMIN_BASIC_AUTH, optional = adminToken == null) { @@ -259,7 +308,10 @@ private fun upsertMetadataOnly(repo: GitHubRepo) { it[licenseSpdxId] = repo.license?.spdxId it[licenseName] = repo.license?.name it[description] = repo.description - it[indexedAt] = java.time.OffsetDateTime.now() + it[pushedAtGh] = repo.pushedAt?.let { raw -> + try { OffsetDateTime.parse(raw) } catch (_: Exception) { null } + } + it[indexedAt] = OffsetDateTime.now() } } } diff --git a/src/main/kotlin/zed/rainxch/githubstore/routes/RepoRoutes.kt b/src/main/kotlin/zed/rainxch/githubstore/routes/RepoRoutes.kt index b2d754b..46ec49b 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/routes/RepoRoutes.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/routes/RepoRoutes.kt @@ -143,4 +143,5 @@ internal fun GitHubRepo.toMetadataOnlyResponse(): RepoResponse = RepoResponse( releasesUrl = "$htmlUrl/releases", updatedAt = updatedAt, createdAt = createdAt, + pushedAt = pushedAt, ) diff --git a/src/main/kotlin/zed/rainxch/githubstore/routes/SearchRoutes.kt b/src/main/kotlin/zed/rainxch/githubstore/routes/SearchRoutes.kt index 3d84262..cf7f21b 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/routes/SearchRoutes.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/routes/SearchRoutes.kt @@ -292,6 +292,7 @@ private fun zed.rainxch.githubstore.db.MeiliRepoHit.toRepoResponse() = RepoRespo releasesUrl = "$html_url/releases", updatedAt = null, createdAt = null, + pushedAt = pushed_at, latestReleaseDate = latest_release_date, latestReleaseTag = latest_release_tag, downloadCount = download_count, diff --git a/src/main/resources/db/migration/V18__pushed_at.sql b/src/main/resources/db/migration/V18__pushed_at.sql new file mode 100644 index 0000000..cbc4221 --- /dev/null +++ b/src/main/resources/db/migration/V18__pushed_at.sql @@ -0,0 +1,4 @@ +-- R5/R13: Add pushed_at_gh to distinguish last-commit timestamp (GitHub's +-- pushed_at / default-branch HEAD) from updated_at_gh (last metadata change). 
+-- Clients use this for the Heartbeat animation period. +ALTER TABLE repos ADD COLUMN pushed_at_gh TIMESTAMPTZ; From 355b8e1c0014c9ebe0db9657a738c4098d81561c Mon Sep 17 00:00:00 2001 From: rainxchzed Date: Thu, 21 May 2026 14:09:08 +0500 Subject: [PATCH 2/3] fix: backfill-pushed-at infinite loop for Archived/Gone repos Gone and Archived branches in runBackfill never wrote pushed_at_gh, so those rows kept appearing in the pushed_at_gh IS NULL filter on every subsequent invocation. Fix: call markPushedAtFallback() for both cases, which stamps COALESCE(updated_at_gh, indexed_at) as a proxy so they are never reconsidered. TransientFailure is intentionally left NULL to be retried. Update endpoint comment to accurately describe termination semantics per outcome branch. Co-Authored-By: Oz --- .../githubstore/routes/InternalRoutes.kt | 42 ++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/src/main/kotlin/zed/rainxch/githubstore/routes/InternalRoutes.kt b/src/main/kotlin/zed/rainxch/githubstore/routes/InternalRoutes.kt index 8df49b0..b180183 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/routes/InternalRoutes.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/routes/InternalRoutes.kt @@ -163,9 +163,13 @@ fun Route.internalRoutes( // One-shot backfill for pushed_at_gh (V18). Iterates every repo // where pushed_at_gh IS NULL, re-fetches from GitHub, and writes - // the field. Self-terminates once all rows are filled. Shares the - // same backfillRunning gate as /backfill-stale so the two never - // run concurrently and don't race the rotation pool. + // the field. 
Terminates once all rows are filled: + // - Ok / NoUsableRelease: real pushed_at from GitHub + // - Archived / Gone: COALESCE(updated_at_gh, indexed_at) as proxy + // so these rows are never reconsidered on subsequent runs + // - TransientFailure: left NULL, re-tried on next invocation + // Shares the same backfillRunning gate as /backfill-stale so the + // two never run concurrently and don't race the rotation pool. post("/backfill-pushed-at") { if (!authorized(call, adminToken)) { return@post respondNotFound(call) @@ -281,8 +285,17 @@ private suspend fun runBackfill( upsertMetadataOnly(result.repo) metadataOnly++ } - GitHubSearchClient.RefreshResult.Gone -> gone++ - GitHubSearchClient.RefreshResult.Archived -> archived++ + GitHubSearchClient.RefreshResult.Gone -> { + // Repo deleted on GitHub — stamp with existing data so it + // doesn't reappear in pushed_at_gh IS NULL queries forever. + markPushedAtFallback(fullName) + gone++ + } + GitHubSearchClient.RefreshResult.Archived -> { + // Repo archived — same stamping rationale as Gone. + markPushedAtFallback(fullName) + archived++ + } GitHubSearchClient.RefreshResult.TransientFailure -> failed++ } delay(pacePerRepoMs) @@ -316,6 +329,25 @@ private fun upsertMetadataOnly(repo: GitHubRepo) { } } +// For Gone/Archived repos we have no live pushed_at from GitHub. +// Use the best available proxy from existing data so the row stops +// appearing in the pushed_at_gh IS NULL filter on future backfill runs. +private fun markPushedAtFallback(fullName: String) { + transaction { + val conn = TransactionManager.current().connection.connection as java.sql.Connection + conn.prepareStatement( + """ + UPDATE repos + SET pushed_at_gh = COALESCE(updated_at_gh, indexed_at) + WHERE full_name = ? 
AND pushed_at_gh IS NULL + """.trimIndent() + ).use { ps -> + ps.setString(1, fullName) + ps.executeUpdate() + } + } +} + private suspend fun fetchDbMetrics(): TrainingMetrics = coroutineScope { val unprocessed = async { countUnprocessedMisses() } val reposWithSignals = async { countReposWithSignals() } From 9067da92fd1f8dd522d2893b48fca713526ce631 Mon Sep 17 00:00:00 2001 From: rainxchzed Date: Thu, 21 May 2026 15:00:35 +0500 Subject: [PATCH 3/3] =?UTF-8?q?feat:=20add=20topicCodes=20field=20?= =?UTF-8?q?=E2=80=94=20normalize=20raw=20GitHub=20topics=20to=2015=20canon?= =?UTF-8?q?ical=20codes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TopicCodeMapper resolves raw GitHub topic strings to canonical codes (ai, privacy, security, networking, messaging, browser, social, launcher, notes, reader, audio, video, photo, backup, self-hosted). Derived at response-construction time from existing topics field — no DB migration, no Meili change. All 5 RepoResponse mappers updated. Frontend can render up to 3 codes as TopicGlyph icons. Replaces the current hardcoded 12-glyph set + alias map with backend-driven normalization covering the actual FOSS app taxonomy in the catalog. 
Co-Authored-By: Oz --- .../rainxch/githubstore/db/RepoRepository.kt | 2 + .../githubstore/db/SearchRepository.kt | 2 + .../githubstore/ingest/GitHubSearchClient.kt | 2 + .../rainxch/githubstore/model/RepoResponse.kt | 6 + .../rainxch/githubstore/routes/RepoRoutes.kt | 2 + .../githubstore/routes/SearchRoutes.kt | 2 + .../githubstore/topics/TopicCodeMapper.kt | 153 ++++++++++++++++++ 7 files changed, 169 insertions(+) create mode 100644 src/main/kotlin/zed/rainxch/githubstore/topics/TopicCodeMapper.kt diff --git a/src/main/kotlin/zed/rainxch/githubstore/db/RepoRepository.kt b/src/main/kotlin/zed/rainxch/githubstore/db/RepoRepository.kt index 96b25f3..76b9067 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/db/RepoRepository.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/db/RepoRepository.kt @@ -5,6 +5,7 @@ import org.jetbrains.exposed.sql.* import org.jetbrains.exposed.sql.transactions.experimental.newSuspendedTransaction import zed.rainxch.githubstore.model.RepoOwner import zed.rainxch.githubstore.model.RepoResponse +import zed.rainxch.githubstore.topics.TopicCodeMapper import zed.rainxch.githubstore.util.formatRecency import java.time.OffsetDateTime import java.time.temporal.ChronoUnit @@ -92,6 +93,7 @@ class RepoRepository { license = nestedLicense(this[Repos.licenseSpdxId], this[Repos.licenseName]), language = this[Repos.language], topics = this[Repos.topics], + topicCodes = TopicCodeMapper.resolve(this[Repos.topics]), releasesUrl = "${this[Repos.htmlUrl]}/releases", updatedAt = this[Repos.updatedAtGh]?.toString(), createdAt = this[Repos.createdAtGh]?.toString(), diff --git a/src/main/kotlin/zed/rainxch/githubstore/db/SearchRepository.kt b/src/main/kotlin/zed/rainxch/githubstore/db/SearchRepository.kt index d55a4a8..0883b6c 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/db/SearchRepository.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/db/SearchRepository.kt @@ -5,6 +5,7 @@ import org.jetbrains.exposed.sql.transactions.TransactionManager import 
org.jetbrains.exposed.sql.transactions.experimental.newSuspendedTransaction import zed.rainxch.githubstore.model.RepoOwner import zed.rainxch.githubstore.model.RepoResponse +import zed.rainxch.githubstore.topics.TopicCodeMapper import zed.rainxch.githubstore.util.formatRecency import java.sql.Array as SqlArray import java.time.OffsetDateTime @@ -112,6 +113,7 @@ class SearchRepository { license = nestedLicense(rs.getString("license_spdx_id"), rs.getString("license_name")), language = rs.getString("language"), topics = topics, + topicCodes = TopicCodeMapper.resolve(topics), releasesUrl = "${rs.getString("html_url")}/releases", updatedAt = rs.getString("updated_at_gh"), createdAt = rs.getString("created_at_gh"), diff --git a/src/main/kotlin/zed/rainxch/githubstore/ingest/GitHubSearchClient.kt b/src/main/kotlin/zed/rainxch/githubstore/ingest/GitHubSearchClient.kt index 61769cc..d2fa28d 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/ingest/GitHubSearchClient.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/ingest/GitHubSearchClient.kt @@ -29,6 +29,7 @@ import zed.rainxch.githubstore.db.Repos import zed.rainxch.githubstore.model.RepoOwner import zed.rainxch.githubstore.model.RepoResponse import zed.rainxch.githubstore.ranking.SearchScore +import zed.rainxch.githubstore.topics.TopicCodeMapper import zed.rainxch.githubstore.util.FeatureFlags import zed.rainxch.githubstore.util.formatRecency import zed.rainxch.githubstore.util.queryHash @@ -623,6 +624,7 @@ class GitHubSearchClient( license = repo.license?.let { zed.rainxch.githubstore.model.RepoLicense(spdxId = it.spdxId, name = it.name) }, language = repo.language, topics = repo.topics, + topicCodes = TopicCodeMapper.resolve(repo.topics), releasesUrl = "${repo.htmlUrl}/releases", updatedAt = repo.updatedAt, createdAt = repo.createdAt, diff --git a/src/main/kotlin/zed/rainxch/githubstore/model/RepoResponse.kt b/src/main/kotlin/zed/rainxch/githubstore/model/RepoResponse.kt index 812e447..a6a9ef1 100644 --- 
a/src/main/kotlin/zed/rainxch/githubstore/model/RepoResponse.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/model/RepoResponse.kt @@ -46,6 +46,12 @@ data class RepoResponse( val license: RepoLicense? = null, val language: String?, val topics: List, + // Canonical topic codes derived from raw topics via TopicCodeMapper. + // 15 possible codes (ai, privacy, security, networking, messaging, browser, + // social, launcher, notes, reader, audio, video, photo, backup, self-hosted). + // Empty when no raw topic matches the canonical set. Frontend renders up to 3 + // as TopicGlyph icons. Never stored in DB or Meili — computed at response time. + val topicCodes: List = emptyList(), val releasesUrl: String?, val updatedAt: String?, val createdAt: String?, diff --git a/src/main/kotlin/zed/rainxch/githubstore/routes/RepoRoutes.kt b/src/main/kotlin/zed/rainxch/githubstore/routes/RepoRoutes.kt index 46ec49b..45bba66 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/routes/RepoRoutes.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/routes/RepoRoutes.kt @@ -11,6 +11,7 @@ import zed.rainxch.githubstore.ingest.GitHubResourceClient import zed.rainxch.githubstore.match.ForgejoResourceClient import zed.rainxch.githubstore.model.RepoOwner import zed.rainxch.githubstore.model.RepoResponse +import zed.rainxch.githubstore.topics.TopicCodeMapper import zed.rainxch.githubstore.util.GitHubIdentifiers private val log = LoggerFactory.getLogger("RepoRoutes") @@ -140,6 +141,7 @@ internal fun GitHubRepo.toMetadataOnlyResponse(): RepoResponse = RepoResponse( license = license?.let { zed.rainxch.githubstore.model.RepoLicense(spdxId = it.spdxId, name = it.name) }, language = language, topics = topics, + topicCodes = TopicCodeMapper.resolve(topics), releasesUrl = "$htmlUrl/releases", updatedAt = updatedAt, createdAt = createdAt, diff --git a/src/main/kotlin/zed/rainxch/githubstore/routes/SearchRoutes.kt b/src/main/kotlin/zed/rainxch/githubstore/routes/SearchRoutes.kt index cf7f21b..76fdbf1 
100644 --- a/src/main/kotlin/zed/rainxch/githubstore/routes/SearchRoutes.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/routes/SearchRoutes.kt @@ -13,6 +13,7 @@ import zed.rainxch.githubstore.model.ExploreResponse import zed.rainxch.githubstore.model.RepoOwner import zed.rainxch.githubstore.model.RepoResponse import zed.rainxch.githubstore.model.SearchResponse +import zed.rainxch.githubstore.topics.TopicCodeMapper private val VALID_PLATFORMS = setOf("android", "windows", "macos", "linux") // `recent` kept for back-compat; `releases` is the public-facing alias. @@ -289,6 +290,7 @@ private fun zed.rainxch.githubstore.db.MeiliRepoHit.toRepoResponse() = RepoRespo license = zed.rainxch.githubstore.db.nestedLicense(license_spdx_id, license_name), language = language, topics = topics, + topicCodes = TopicCodeMapper.resolve(topics), releasesUrl = "$html_url/releases", updatedAt = null, createdAt = null,
diff --git a/src/main/kotlin/zed/rainxch/githubstore/topics/TopicCodeMapper.kt b/src/main/kotlin/zed/rainxch/githubstore/topics/TopicCodeMapper.kt
new file mode 100644
index 0000000..4265259
--- /dev/null
+++ b/src/main/kotlin/zed/rainxch/githubstore/topics/TopicCodeMapper.kt
@@ -0,0 +1,153 @@
+package zed.rainxch.githubstore.topics
+
+/**
+ * Maps raw GitHub topic strings to canonical topic codes.
+ *
+ * 15 codes, chosen from frequency analysis of 11k+ repos in our index
+ * plus F-Droid category taxonomy. Excludes programming languages, OS tags,
+ * and build tooling — only app-category concepts.
+ *
+ * Call [resolve] with a repo's raw topics list; returns all matching
+ * canonical codes in priority order (most distinctive first). The frontend
+ * renders up to 3 as TopicGlyph icons.
+ */
+object TopicCodeMapper {
+
+    /**
+     * Returns canonical topic codes that match the given raw GitHub topics.
+     * Matching ignores case. Order follows [PRIORITY_ORDER], so each code appears at most once.
+     */
+    fun resolve(topics: List<String>): List<String> {
+        if (topics.isEmpty()) return emptyList()
+        val lower = topics.mapTo(mutableSetOf()) { it.lowercase() }
+        return PRIORITY_ORDER.filter { code ->
+            MAPPINGS.getValue(code).any { it in lower }
+        }
+    }
+
+    // ── Canonical codes → raw GitHub topic aliases ─────────────────────────
+
+    private val MAPPINGS: Map<String, Set<String>> = mapOf(
+
+        // User intent: protect identity / stop tracking — broader principle
+        "privacy" to setOf(
+            "privacy", "privacy-tools", "privacy-focused", "anonymity",
+            "no-telemetry", "tracking-protection", "degoogle", "anti-tracking",
+            "tracker-blocker", "ungoogled", "de-google",
+        ),
+
+        // User intent: harden secrets / authenticate — specific mechanism
+        "security" to setOf(
+            "security", "encryption", "2fa", "totp", "otp", "pgp", "gpg",
+            "e2ee", "end-to-end-encryption", "password-manager", "authenticator",
+            "cryptography", "cipher", "keystore", "biometric",
+        ),
+
+        // User intent: route / tunnel / block traffic at the network layer
+        "networking" to setOf(
+            "vpn", "proxy", "shadowsocks", "v2ray", "xray", "vless", "vmess",
+            "trojan", "sing-box", "clash", "hysteria", "wireguard",
+            "dns", "ad-blocker", "adblock", "adblocker", "firewall",
+            "p2p", "torrent", "downloader", "download-manager", "network",
+            "ssh", "socks5", "http-proxy", "tor",
+        ),
+
+        // User intent: interact with AI models / agents
+        "ai" to setOf(
+            "ai", "artificial-intelligence", "chatgpt", "llm", "large-language-model",
+            "mcp", "agent", "ai-agent", "gemini", "deepseek", "openai",
+            "ollama", "claude", "copilot", "gpt", "local-llm", "on-device-ai",
+        ),
+
+        // User intent: capture and organise ideas / tasks
+        "notes" to setOf(
+            "note-taking", "notes-app", "notes", "note", "note-app",
+            "markdown", "knowledge-base", "pkm", "second-brain", "zettelkasten",
+            "todo", "task-manager", "tasks", "to-do", "journal", "diary",
+            "writing", "text-editor", "notetaking", "productivity",
+            "local-first", "offline-first",
+        ),
+
+        // User intent: listen to music / podcasts / radio
+        "audio" to setOf(
+            "music-player", "music", "podcast", "podcasts", "radio",
+            "audio", "audio-player", "mpd", "scrobbler",
+        ),
+
+        // User intent: watch video / streams
+        "video" to setOf(
+            "video-player", "video", "streaming", "youtube", "iptv",
+            "media-player", "danmaku", "online-video", "video-streaming",
+        ),
+
+        // User intent: manage or view images / camera
+        "photo" to setOf(
+            "photo", "photos", "gallery", "camera", "image-viewer",
+            "google-photos-alternative", "image-gallery", "photo-gallery",
+            "screenshots",
+        ),
+
+        // User intent: read long-form content offline
+        "reader" to setOf(
+            "ebook", "e-reader", "epub", "pdf", "djvu", "cbz", "cbr",
+            "book", "manga", "comic", "comics", "rss", "rss-reader",
+            "feed-reader", "reading",
+        ),
+
+        // User intent: send messages / calls to other people
+        "messaging" to setOf(
+            "messaging", "chat", "instant-messaging", "im",
+            "matrix", "xmpp", "email", "mail", "voip", "sip",
+            "sms", "telegram", "signal", "irc", "discord-alternative",
+        ),
+
+        // User intent: browse the web
+        "browser" to setOf(
+            "browser", "web-browser", "firefox-fork",
+        ),
+
+        // User intent: run services on own hardware
+        "self-hosted" to setOf(
+            "self-hosted", "self-hosting", "homeserver", "home-server",
+            "self-host",
+        ),
+
+        // User intent: back up or sync files across devices
+        "backup" to setOf(
+            "backup", "sync", "synchronization", "file-sync",
+            "cloud-sync", "webdav", "nextcloud", "syncthing",
+        ),
+
+        // User intent: interact with social / fediverse networks
+        "social" to setOf(
+            "social-network", "mastodon", "fediverse", "activitypub",
+            "bluesky", "twitter-alternative", "pleroma", "misskey",
+            "nostr", "lemmy", "pixelfed",
+        ),
+
+        // User intent: customise Android home screen / input
+        "launcher" to setOf(
+            "launcher", "android-launcher", "home-screen",
+        ),
+    )
+
+    // Priority order: most distinctive for our FOSS audience first.
+    // A repo matching multiple codes shows the highest-priority ones.
+    private val PRIORITY_ORDER = listOf(
+        "ai",
+        "privacy",
+        "security",
+        "networking",
+        "messaging",
+        "browser",
+        "social",
+        "launcher",
+        "notes",
+        "reader",
+        "audio",
+        "video",
+        "photo",
+        "backup",
+        "self-hosted",
+    )
+}