From 13b4a8006d43266da91c943ca8750c7f0491c912 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Tue, 17 Mar 2026 17:04:10 +0100 Subject: [PATCH 01/12] add github source --- src/components/App/App.tsx | 2 + src/lib/sources/gitHubSource.ts | 262 ++++++++++++++++++++++++++ src/lib/sources/index.ts | 1 + test/lib/sources/gitHubSource.test.ts | 236 +++++++++++++++++++++++ 4 files changed, 501 insertions(+) create mode 100644 src/lib/sources/gitHubSource.ts create mode 100644 test/lib/sources/gitHubSource.test.ts diff --git a/src/components/App/App.tsx b/src/components/App/App.tsx index 3a539ad..35a3407 100644 --- a/src/components/App/App.tsx +++ b/src/components/App/App.tsx @@ -1,5 +1,6 @@ import { useMemo } from 'react' import { Config, ConfigProvider } from '../../hooks/useConfig.js' +import { getGitHubSource } from '../../lib/sources/gitHubSource.js' import { getHttpSource } from '../../lib/sources/httpSource.js' import { getHuggingFaceSource } from '../../lib/sources/huggingFaceSource.js' import { getHyperparamSource } from '../../lib/sources/hyperparamSource.js' @@ -12,6 +13,7 @@ export default function App() { const col = search.get('col') === null ? undefined : Number(search.get('col')) const source = getHuggingFaceSource(sourceId) ?? + getGitHubSource(sourceId) ?? getHttpSource(sourceId) ?? getHyperparamSource(sourceId, { endpoint: location.origin }) diff --git a/src/lib/sources/gitHubSource.ts b/src/lib/sources/gitHubSource.ts new file mode 100644 index 0000000..87e9024 --- /dev/null +++ b/src/lib/sources/gitHubSource.ts @@ -0,0 +1,262 @@ +import type { DirSource, FileMetadata, FileSource, SourcePart } from './types.js' +import { getFileName } from './utils.js' + +interface BaseUrl { + source: string + origin: string + repo: string + branch: string + path: string +} + +interface DirectoryUrl extends BaseUrl { + kind: 'directory' + action: 'tree' +} + +interface FileUrl extends BaseUrl { + kind: 'file' + action?: 'blob' | 'raw/refs/heads' + resolveUrl: string +} + +interface RawFileUrl extends BaseUrl { + kind: 'file' + action: undefined + resolveUrl: string +} + +type GHUrl = DirectoryUrl | FileUrl | RawFileUrl + +const baseUrl = 'https://github.com' +const baseRawUrl = 'https://raw.githubusercontent.com' + +function getSourceParts(url: GHUrl): SourcePart[] { + const sourceParts: SourcePart[] = [{ + sourceId: `${baseUrl}/${url.repo}/tree/${url.branch}/`, + text: `${baseUrl}/${url.repo}/tree/${url.branch}/`, + }] + + const pathParts = url.path.split('/').filter(d => d.length > 0) + const lastPart = pathParts.at(-1) + if (lastPart) { + for (const [i, part] of pathParts.slice(0, -1).entries()) { + sourceParts.push({ + sourceId: `${baseUrl}/${url.repo}/tree/${url.branch}/${pathParts.slice(0, i + 1).join('/')}`, + text: part + '/', + }) + } + sourceParts.push({ + sourceId: `${baseUrl}/${url.repo}/${url.action === 'tree' ? 'tree/' : 'blob/'}${url.branch}${url.path}`, + text: lastPart, + }) + } + return sourceParts +} +function getPrefix(url: DirectoryUrl): string { + return `${baseUrl}/${url.repo}/tree/${url.branch}${url.path}`.replace(/\/$/, '') +} +async function fetchFilesList(url: DirectoryUrl, options?: { requestInit?: RequestInit, accessToken?: string }): Promise { + const apiURL = `https://api.github.com/repos/${url.repo}/contents/${url.path}?ref=${url.branch}` + const headers: Record = { + 'Accept': 'application/vnd.github+json', + } + if (options?.accessToken) { + headers.Authorization = `token ${options.accessToken}` + } + const response = await fetch(apiURL, { + method: 'GET', + headers, + ...options?.requestInit, + }) + if (!response.ok) { + throw new Error(`GitHub API error: ${response.status} ${response.statusText} - ${await response.text()}`) + } + try { + const data: unknown = await response.json() + const isDirectory = Array.isArray(data) + if (!isDirectory) { + throw new Error('Not a directory') + } + const files: FileMetadata[] = [] + for (const file of data as unknown[]) { + if (typeof file !== 'object' || file === null || !('name' in file) || !('path' in file) || !('type' in file) || !('size' in file)) { + throw new Error('Invalid file metadata') + } + if (file.type !== 'file' && file.type !== 'dir') { + throw new Error('Unsupported file type') + } + if (typeof file.name !== 'string' || typeof file.path !== 'string' || typeof file.size !== 'number') { + throw new Error('Invalid file metadata types') + } + files.push({ + name: getFileName(file.path), + fileSize: file.size, + sourceId: `${url.origin}/${url.repo}/${file.type === 'file' ? 'blob' : 'tree'}/${url.branch}/${file.path}`.replace(/\/$/, ''), + kind: file.type === 'file' ? 'file' : 'directory', // 'unknown' is considered as a directory + }) + } + return files + } catch (error) { + throw new Error(`Failed to parse GitHub API response: ${error instanceof Error ? error.message : String(error)}`) + } +} +export function getGitHubSource(sourceId: string, options?: {requestInit?: RequestInit, accessToken?: string}): FileSource | DirSource | undefined { + try { + const url = parseGitHubUrl(sourceId) + // async function fetchVersions() { + // const refsList = await fetchRefsList(url, options) + // return { + // label: 'Branches', + // versions: refsList.map(({ refType, name, ref }) => { + // const label = refType === 'branches' ? name : + // refType === 'converts' ? `[convert] ${name}` : + // refType === 'tags' ? `[tag] ${name}` : + // `[pr] ${name}` + // // remove refs/heads/ from the ref name + // // e.g. refs/heads/main -> main + // const fixedRef = refType === 'branches' ? ref.replace(/refs\/heads\//, '') : ref + // const branchSourceId = `${url.origin}/${getFullName(url)}/${url.kind === 'file' ? 'blob' : 'tree'}/${fixedRef}${url.path}` + // return { + // label, + // sourceId: branchSourceId, + // } + // }), + // } + // } + if (url.kind === 'file') { + return { + kind: 'file', + sourceId, + sourceParts: getSourceParts(url), + fileName: getFileName(url.path), + resolveUrl: url.resolveUrl, + requestInit: options?.requestInit, + // fetchVersions, + } + } else { + return { + kind: 'directory', + sourceId, + sourceParts: getSourceParts(url), + prefix: getPrefix(url), + listFiles: () => fetchFilesList(url, options), + // fetchVersions, + } + } + } catch { + return undefined + } +} + +export function parseGitHubUrl(url: string): GHUrl { + const urlObject = new URL(url) + // ^ throws 'TypeError: URL constructor: {url} is not a valid URL.' if url is not a valid URL + + if ( + urlObject.protocol !== 'https:' || + ![ + 'github.co', 'github.com', 'www.github.com', 'raw.githubusercontent.com', + ].includes(urlObject.host) + ) { + throw new Error('Not a GitHub URL') + } + + const { pathname } = urlObject + + if (urlObject.host === 'raw.githubusercontent.com') { + // https://raw.githubusercontent.com/apache/parquet-testing/refs/heads/master/variant/README.md + const rawFileGroups = + /^\/(?[^/]+)\/(?[^/]+)\/(?(refs\/heads\/)?)(?[^/]+)(?(\/[^/]+)+)$/.exec( + pathname + )?.groups + if ( + rawFileGroups?.owner !== undefined && + rawFileGroups.repo !== undefined && + rawFileGroups.branch !== undefined && + rawFileGroups.path !== undefined + ) { + const branch = rawFileGroups.branch.replace(/\//g, '%2F') + const source = `${urlObject.origin}/${rawFileGroups.owner}/${rawFileGroups.repo}/${branch}${rawFileGroups.path}` + return { + kind: 'file', + source, + origin: urlObject.origin, + repo: rawFileGroups.owner + '/' + rawFileGroups.repo, + branch, + path: rawFileGroups.path, + resolveUrl: source, + } + } else { + throw new Error('Unsupported GitHub URL') + } + } + + const repoGroups = /^\/(?[^/]+)\/(?[^/]+)\/?$/.exec( + pathname + )?.groups + if (repoGroups?.owner !== undefined && repoGroups.repo !== undefined) { + return { + kind: 'directory', + source: url, + origin: urlObject.origin, + repo: repoGroups.owner + '/' + repoGroups.repo, + action: 'tree', + branch: 'main', // hardcode the default branch + path: '', + } + } + + const folderGroups = + /^\/(?[^/]+)\/(?[^/]+)\/(?tree)\/(?[^/]+)(?(\/[^/]+)*)\/?$/.exec( + pathname + )?.groups + if ( + folderGroups?.owner !== undefined && + folderGroups.repo !== undefined && + folderGroups.action !== undefined && + folderGroups.branch !== undefined && + folderGroups.path !== undefined + ) { + const branch = folderGroups.branch.replace(/\//g, '%2F') + const source = `${urlObject.origin}/${folderGroups.owner}/${folderGroups.repo}/${folderGroups.action}/${branch}${folderGroups.path}` + return { + kind: 'directory', + source, + origin: urlObject.origin, + repo: folderGroups.owner + '/' + folderGroups.repo, + action: 'tree', + branch, + path: folderGroups.path, + } + } + + // https://github.com/apache/parquet-testing/blob/master/variant/README.md + // https://github.com/apache/parquet-testing/raw/refs/heads/master/variant/README.md + const fileGroups = + /^\/(?[^/]+)\/(?[^/]+)\/(?blob|refs\/heads|raw\/refs\/heads)\/(?[^/]+)(?(\/[^/]+)+)$/.exec( + pathname + )?.groups + if ( + fileGroups?.owner !== undefined && + fileGroups.repo !== undefined && + fileGroups.action !== undefined && + fileGroups.branch !== undefined && + fileGroups.path !== undefined + ) { + const branch = fileGroups.branch.replace(/\//g, '%2F') + const source = `${urlObject.origin}/${fileGroups.owner}/${fileGroups.repo}/${fileGroups.action}/${branch}${fileGroups.path}` + return { + kind: 'file', + source, + origin: urlObject.origin, + repo: fileGroups.owner + '/' + fileGroups.repo, + action: fileGroups.action === 'blob' ? 'blob' : 'raw/refs/heads', + branch, + path: fileGroups.path, + resolveUrl: `${baseRawUrl}/${fileGroups.owner}/${fileGroups.repo}/refs/heads/${branch}${fileGroups.path}`, + } + } + + throw new Error('Unsupported GitHub URL') +} diff --git a/src/lib/sources/index.ts b/src/lib/sources/index.ts index 648af03..155d5a2 100644 --- a/src/lib/sources/index.ts +++ b/src/lib/sources/index.ts @@ -1,6 +1,7 @@ export { getHttpSource } from './httpSource.js' export { getHyperparamSource } from './hyperparamSource.js' export { getHuggingFaceSource } from './huggingFaceSource.js' +export { getGitHubSource } from './gitHubSource.js' export type { HyperparamFileMetadata } from './hyperparamSource.js' export type { DirSource, FileKind, FileMetadata, FileSource, Source, SourcePart } from './types.js' export { getFileName } from './utils.js' diff --git a/test/lib/sources/gitHubSource.test.ts b/test/lib/sources/gitHubSource.test.ts new file mode 100644 index 0000000..ab71a59 --- /dev/null +++ b/test/lib/sources/gitHubSource.test.ts @@ -0,0 +1,236 @@ +import { describe, expect, it, test } from 'vitest' +import { getGitHubSource, parseGitHubUrl } from '../../../src/lib/sources/gitHubSource.js' + +describe('parseGitHubUrl', () => { + test.for([ + 'github.co', + 'github.com', + 'www.github.com', + ])('accepts domain: %s', (domain) => { + const origin = `https://${domain}` + const url = `${origin}/owner/repo` + expect(parseGitHubUrl(url)).toEqual({ + kind: 'directory', + origin, + repo: 'owner/repo', + source: url, + action: 'tree', + branch: 'main', + path: '', + }) + }) + + it('throws for unsupported scheme or domain', () => { + expect(() => parseGitHubUrl('ftp://github.com/owner/repo')).toThrow() + expect(() => parseGitHubUrl('email://github.com/owner/repo')).toThrow() + expect(() => parseGitHubUrl('http://github.com/owner/repo')).toThrow() + expect(() => parseGitHubUrl('https://hf.com/owner/repo')).toThrow() + expect(() => parseGitHubUrl('https://huggingface.co/owner/repo')).toThrow() + expect(() => parseGitHubUrl('github.com/owner/repo')).toThrow() + }) + + test.for([ + '', + '/', + // for the following tests, the same is true with a trailing slash + // Avoiding for brevity. + '/owner', + '/owner/repo/branch', + '/owner/repo/tree', + '/owner/repo/blob', + '/owner/repo/blob/branch', + // note the trailing slash + '/owner/repo/blob/branch/file/', + ])('throws for invalid path: %s', (path) => { + expect(() => parseGitHubUrl(`https://github.com${path}`)).to.throw() + }) + + test.for([ + // Root directory + [ + 'https://github.com/owner/repo', + 'https://github.com/owner/repo', + 'owner/repo', + 'main', + '', + ], + [ + 'https://github.com/owner/repo/', + 'https://github.com/owner/repo/', + 'owner/repo', + 'main', + '', + ], + // all-number identifier is not a valid GitHub repo name, but we accept any string + [ + 'https://github.com/owner/123', + 'https://github.com/owner/123', + 'owner/123', + 'main', + '', + ], + // Branches + [ + 'https://github.com/owner/repo/tree/branch', + 'https://github.com/owner/repo/tree/branch', + 'owner/repo', + 'branch', + '', + ], + [ + 'https://github.com/owner/repo/tree/branch/', + 'https://github.com/owner/repo/tree/branch', + 'owner/repo', + 'branch', + '', + ], + // Subdirectories + [ + 'https://github.com/owner/repo/tree/branch/folder', + 'https://github.com/owner/repo/tree/branch/folder', + 'owner/repo', + 'branch', + '/folder', + ], + [ + 'https://github.com/owner/repo/tree/branch/a/b/c/', + 'https://github.com/owner/repo/tree/branch/a/b/c', + 'owner/repo', + 'branch', + '/a/b/c', + ], + // A subdirectory can have a dot in its name (what matters is 'tree' vs 'blob') + [ + 'https://github.com/owner/repo/tree/branch/folder.parquet', + 'https://github.com/owner/repo/tree/branch/folder.parquet', + 'owner/repo', + 'branch', + '/folder.parquet', + ], + ])( + 'parses a DirectoryUrl for root or subdirectory: %s', + ([url, source, repo, branch, path]) => { + expect(parseGitHubUrl(url)).toEqual({ + kind: 'directory', + origin, + repo, + source, + action: 'tree', + branch, + path, + }) + } + ) + + const origin = 'https://github.com' + const branch = 'branch' + const repo = 'owner/repo' + const path = '/path/to/file.parquet' + it('parses a FileUrl for file URL', () => { + const url = `https://github.com/${repo}/blob/${branch}${path}` + const resolveUrl = `https://raw.githubusercontent.com/${repo}/refs/heads/${branch}${path}` + expect(parseGitHubUrl(url)).toEqual({ + kind: 'file', + origin, + repo, + source: url, + action: 'blob', + branch, + path, + resolveUrl, + }) + } + ) +}) + +describe('getGitHubSource', () => { + describe('source parts', () => { + it('returns the URL for a repository URL', () => { + const url = 'https://github.com/owner/repo' + expect(getGitHubSource(url)?.sourceParts).toEqual([{ + sourceId: 'https://github.com/owner/repo/tree/main/', + text: 'https://github.com/owner/repo/tree/main/', + }]) + }) + it('returns the URL for a branch root URL', () => { + const url = 'https://github.com/owner/repo/tree/branch' + expect(getGitHubSource(url)?.sourceParts).toEqual([{ + sourceId: 'https://github.com/owner/repo/tree/branch/', + text: 'https://github.com/owner/repo/tree/branch/', + }]) + }) + it('returns the URL then every parent directory for a branch subdirectory URL', () => { + const url = 'https://github.com/owner/repo/tree/branch/a/b/c' + expect(getGitHubSource(url)?.sourceParts).toEqual([ + { + sourceId: 'https://github.com/owner/repo/tree/branch/', + text: 'https://github.com/owner/repo/tree/branch/', + }, + { + sourceId: 'https://github.com/owner/repo/tree/branch/a', + text: 'a/', + }, + { + sourceId: 'https://github.com/owner/repo/tree/branch/a/b', + text: 'b/', + }, + { + sourceId: 'https://github.com/owner/repo/tree/branch/a/b/c', + text: 'c', + }, + ]) + }) + it('returns the URL then every parent directory then the blob URL for a file URL', () => { + const url = 'https://github.com/owner/repo/blob/branch/a/b/c/file.parquet' + expect(getGitHubSource(url)?.sourceParts).toEqual([ + { + sourceId: 'https://github.com/owner/repo/tree/branch/', + text: 'https://github.com/owner/repo/tree/branch/', + }, + { + sourceId: 'https://github.com/owner/repo/tree/branch/a', + text: 'a/', + }, + { + sourceId: 'https://github.com/owner/repo/tree/branch/a/b', + text: 'b/', + }, + { + sourceId: 'https://github.com/owner/repo/tree/branch/a/b/c', + text: 'c/', + }, + { + sourceId: 'https://github.com/owner/repo/blob/branch/a/b/c/file.parquet', + text: 'file.parquet', + }, + ]) + }) + test.for([ + 'https://raw.githubusercontent.com/owner/repo/branch/a/b/c/file.parquet', + 'https://raw.githubusercontent.com/owner/repo/refs/heads/branch/a/b/c/file.parquet', + ])('returns github.com parts for a raw URL', (url) => { + expect(getGitHubSource(url)?.sourceParts).toEqual([ + { + sourceId: 'https://github.com/owner/repo/tree/branch/', + text: 'https://github.com/owner/repo/tree/branch/', + }, + { + sourceId: 'https://github.com/owner/repo/tree/branch/a', + text: 'a/', + }, + { + sourceId: 'https://github.com/owner/repo/tree/branch/a/b', + text: 'b/', + }, + { + sourceId: 'https://github.com/owner/repo/tree/branch/a/b/c', + text: 'c/', + }, + { + sourceId: 'https://github.com/owner/repo/blob/branch/a/b/c/file.parquet', + text: 'file.parquet', + }, + ]) + }) + }) +}) From 309cc7ce7c3bafe018d43aba58e99e7cc7494d26 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Tue, 17 Mar 2026 17:13:29 +0100 Subject: [PATCH 02/12] fetch branches --- src/lib/sources/gitHubSource.ts | 66 ++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/src/lib/sources/gitHubSource.ts b/src/lib/sources/gitHubSource.ts index 87e9024..f0f3d47 100644 --- a/src/lib/sources/gitHubSource.ts +++ b/src/lib/sources/gitHubSource.ts @@ -104,26 +104,19 @@ async function fetchFilesList(url: DirectoryUrl, options?: { requestInit?: Reque export function getGitHubSource(sourceId: string, options?: {requestInit?: RequestInit, accessToken?: string}): FileSource | DirSource | undefined { try { const url = parseGitHubUrl(sourceId) - // async function fetchVersions() { - // const refsList = await fetchRefsList(url, options) - // return { - // label: 'Branches', - // versions: refsList.map(({ refType, name, ref }) => { - // const label = refType === 'branches' ? name : - // refType === 'converts' ? `[convert] ${name}` : - // refType === 'tags' ? `[tag] ${name}` : - // `[pr] ${name}` - // // remove refs/heads/ from the ref name - // // e.g. refs/heads/main -> main - // const fixedRef = refType === 'branches' ? ref.replace(/refs\/heads\//, '') : ref - // const branchSourceId = `${url.origin}/${getFullName(url)}/${url.kind === 'file' ? 'blob' : 'tree'}/${fixedRef}${url.path}` - // return { - // label, - // sourceId: branchSourceId, - // } - // }), - // } - // } + async function fetchVersions() { + const branches = await fetchBranchesList(url, options) + return { + label: 'Branches', + versions: branches.map((branch) => { + const branchSourceId = `${url.origin}/${url.repo}/${url.kind === 'file' ? 'blob' : 'tree'}/${branch}${url.path}` + return { + label: branch, + sourceId: branchSourceId, + } + }), + } + } if (url.kind === 'file') { return { kind: 'file', @@ -132,7 +125,7 @@ export function getGitHubSource(sourceId: string, options?: {requestInit?: Reque fileName: getFileName(url.path), resolveUrl: url.resolveUrl, requestInit: options?.requestInit, - // fetchVersions, + fetchVersions, } } else { return { @@ -141,7 +134,7 @@ export function getGitHubSource(sourceId: string, options?: {requestInit?: Reque sourceParts: getSourceParts(url), prefix: getPrefix(url), listFiles: () => fetchFilesList(url, options), - // fetchVersions, + fetchVersions, } } } catch { @@ -260,3 +253,32 @@ export function parseGitHubUrl(url: string): GHUrl { throw new Error('Unsupported GitHub URL') } + +/** + * List branches in a GitHub dataset repo + * + * Example API URL: https://api.github.com/repos/owner/repo/branches + * + * @param repo (namespace/repo) + * @param [options] + * @param [options.requestInit] - request init object to pass to fetch + * @param [options.accessToken] - access token to use for authentication + * + * @returns the list of branch names + */ +async function fetchBranchesList( + url: GHUrl, + options?: {requestInit?: RequestInit, accessToken?: string} +): Promise { + const headers = new Headers(options?.requestInit?.headers) + headers.set('accept', 'application/vnd.github+json') + if (options?.accessToken) { + headers.set('Authorization', `Bearer ${options.accessToken}`) + } + const response = await fetch(`https://api.github.com/repos/${url.repo}/branches`, { ...options?.requestInit, headers }) + if (!response.ok) { + throw new Error(`HTTP error ${response.status.toString()}`) + } + const branches = await response.json() as {name: string}[] + return branches.map(({ name }) => name) +} From f604353138aa3c9b2e964fd297332bf8b7805cf1 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Tue, 17 Mar 2026 12:19:27 -0400 Subject: [PATCH 03/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/lib/sources/gitHubSource.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/sources/gitHubSource.ts b/src/lib/sources/gitHubSource.ts index f0f3d47..32be035 100644 --- a/src/lib/sources/gitHubSource.ts +++ b/src/lib/sources/gitHubSource.ts @@ -62,7 +62,7 @@ async function fetchFilesList(url: DirectoryUrl, options?: { requestInit?: Reque 'Accept': 'application/vnd.github+json', } if (options?.accessToken) { - headers.Authorization = `token ${options.accessToken}` + headers.Authorization = `Bearer ${options.accessToken}` } const response = await fetch(apiURL, { method: 'GET', From 380d8854f5f4c71f621117bca61702902c44155d Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Tue, 17 Mar 2026 12:21:10 -0400 Subject: [PATCH 04/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/lib/sources/gitHubSource.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/sources/gitHubSource.ts b/src/lib/sources/gitHubSource.ts index 32be035..830d90a 100644 --- a/src/lib/sources/gitHubSource.ts +++ b/src/lib/sources/gitHubSource.ts @@ -109,7 +109,7 @@ export function getGitHubSource(sourceId: string, options?: {requestInit?: Reque return { label: 'Branches', versions: branches.map((branch) => { - const branchSourceId = `${url.origin}/${url.repo}/${url.kind === 'file' ? 'blob' : 'tree'}/${branch}${url.path}` + const branchSourceId = `${baseUrl}/${url.repo}/${url.kind === 'file' ? 'blob' : 'tree'}/${branch}${url.path}` return { label: branch, sourceId: branchSourceId, From c26014cdbb3ca768b05460b1f47f7c34c38b3786 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Tue, 17 Mar 2026 17:25:15 +0100 Subject: [PATCH 05/12] copilot comments --- src/lib/sources/gitHubSource.ts | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/lib/sources/gitHubSource.ts b/src/lib/sources/gitHubSource.ts index 830d90a..d9b8fad 100644 --- a/src/lib/sources/gitHubSource.ts +++ b/src/lib/sources/gitHubSource.ts @@ -57,17 +57,16 @@ function getPrefix(url: DirectoryUrl): string { return `${baseUrl}/${url.repo}/tree/${url.branch}${url.path}`.replace(/\/$/, '') } async function fetchFilesList(url: DirectoryUrl, options?: { requestInit?: RequestInit, accessToken?: string }): Promise { - const apiURL = `https://api.github.com/repos/${url.repo}/contents/${url.path}?ref=${url.branch}` - const headers: Record = { - 'Accept': 'application/vnd.github+json', - } + const apiURL = `https://api.github.com/repos/${url.repo}/contents${url.path}?ref=${url.branch}` + const headers = new Headers(options?.requestInit?.headers) + headers.set('Accept', 'application/vnd.github+json') if (options?.accessToken) { - headers.Authorization = `Bearer ${options.accessToken}` + headers.set('Authorization', `Bearer ${options.accessToken}`) } const response = await fetch(apiURL, { + ...options?.requestInit, method: 'GET', headers, - ...options?.requestInit, }) if (!response.ok) { throw new Error(`GitHub API error: ${response.status} ${response.statusText} - ${await response.text()}`) From 7895eb36326c3bf8155aaf581da60bbdceee6311 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Tue, 17 Mar 2026 17:27:00 +0100 Subject: [PATCH 06/12] support /raw/ --- src/lib/sources/gitHubSource.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lib/sources/gitHubSource.ts b/src/lib/sources/gitHubSource.ts index d9b8fad..2a3f669 100644 --- a/src/lib/sources/gitHubSource.ts +++ b/src/lib/sources/gitHubSource.ts @@ -16,7 +16,7 @@ interface DirectoryUrl extends BaseUrl { interface FileUrl extends BaseUrl { kind: 'file' - action?: 'blob' | 'raw/refs/heads' + action?: 'blob' | 'raw' | 'raw/refs/heads' resolveUrl: string } @@ -226,7 +226,7 @@ export function parseGitHubUrl(url: string): GHUrl { // https://github.com/apache/parquet-testing/blob/master/variant/README.md // https://github.com/apache/parquet-testing/raw/refs/heads/master/variant/README.md const fileGroups = - /^\/(?[^/]+)\/(?[^/]+)\/(?blob|refs\/heads|raw\/refs\/heads)\/(?[^/]+)(?(\/[^/]+)+)$/.exec( + /^\/(?[^/]+)\/(?[^/]+)\/(?blob|raw|raw\/refs\/heads)\/(?[^/]+)(?(\/[^/]+)+)$/.exec( pathname )?.groups if ( @@ -243,10 +243,10 @@ export function parseGitHubUrl(url: string): GHUrl { source, origin: urlObject.origin, repo: fileGroups.owner + '/' + fileGroups.repo, - action: fileGroups.action === 'blob' ? 'blob' : 'raw/refs/heads', + action: fileGroups.action === 'blob' ? 'blob' : fileGroups.action === 'raw' ? 'raw' : 'raw/refs/heads', branch, path: fileGroups.path, - resolveUrl: `${baseRawUrl}/${fileGroups.owner}/${fileGroups.repo}/refs/heads/${branch}${fileGroups.path}`, + resolveUrl: `${baseRawUrl}/${fileGroups.owner}/${fileGroups.repo}/${branch}${fileGroups.path}`, } } From 27b95e9bbdabe29854ad0927abd786e14f2296a5 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 18 Mar 2026 11:19:07 +0100 Subject: [PATCH 07/12] fix test --- test/lib/sources/gitHubSource.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/lib/sources/gitHubSource.test.ts b/test/lib/sources/gitHubSource.test.ts index ab71a59..64f77f1 100644 --- a/test/lib/sources/gitHubSource.test.ts +++ b/test/lib/sources/gitHubSource.test.ts @@ -128,7 +128,7 @@ describe('parseGitHubUrl', () => { const path = '/path/to/file.parquet' it('parses a FileUrl for file URL', () => { const url = `https://github.com/${repo}/blob/${branch}${path}` - const resolveUrl = `https://raw.githubusercontent.com/${repo}/refs/heads/${branch}${path}` + const resolveUrl = `https://raw.githubusercontent.com/${repo}/${branch}${path}` expect(parseGitHubUrl(url)).toEqual({ kind: 'file', origin, From 51faf342d2e1340e5dadbf93c3a83a5b7aee6fd8 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 18 Mar 2026 11:23:13 +0100 Subject: [PATCH 08/12] assume the response has the correct type --- src/lib/sources/gitHubSource.ts | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/src/lib/sources/gitHubSource.ts b/src/lib/sources/gitHubSource.ts index 2a3f669..7c69634 100644 --- a/src/lib/sources/gitHubSource.ts +++ b/src/lib/sources/gitHubSource.ts @@ -72,30 +72,13 @@ async function fetchFilesList(url: DirectoryUrl, options?: { requestInit?: Reque throw new Error(`GitHub API error: ${response.status} ${response.statusText} - ${await response.text()}`) } try { - const data: unknown = await response.json() - const isDirectory = Array.isArray(data) - if (!isDirectory) { - throw new Error('Not a directory') - } - const files: FileMetadata[] = [] - for (const file of data as unknown[]) { - if (typeof file !== 'object' || file === null || !('name' in file) || !('path' in file) || !('type' in file) || !('size' in file)) { - throw new Error('Invalid file metadata') - } - if (file.type !== 'file' && file.type !== 'dir') { - throw new Error('Unsupported file type') - } - if (typeof file.name !== 'string' || typeof file.path !== 'string' || typeof file.size !== 'number') { - throw new Error('Invalid file metadata types') - } - files.push({ - name: getFileName(file.path), - fileSize: file.size, - sourceId: `${url.origin}/${url.repo}/${file.type === 'file' ? 'blob' : 'tree'}/${url.branch}/${file.path}`.replace(/\/$/, ''), - kind: file.type === 'file' ? 'file' : 'directory', // 'unknown' is considered as a directory - }) - } - return files + const data = await response.json() as {name: string, path: string, type: 'file' | 'dir', size: number}[] + return data.map((file) => ({ + name: getFileName(file.path), + fileSize: file.size, + sourceId: `${url.origin}/${url.repo}/${file.type === 'file' ? 'blob' : 'tree'}/${url.branch}/${file.path}`.replace(/\/$/, ''), + kind: file.type === 'file' ? 'file' : 'directory', + })) } catch (error) { throw new Error(`Failed to parse GitHub API response: ${error instanceof Error ? error.message : String(error)}`) } From b3643462d754a1cf970eb2a289f8ee6dffa57d5e Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 18 Mar 2026 11:25:10 +0100 Subject: [PATCH 09/12] add a TODO --- src/lib/sources/gitHubSource.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lib/sources/gitHubSource.ts b/src/lib/sources/gitHubSource.ts index 7c69634..110628f 100644 --- a/src/lib/sources/gitHubSource.ts +++ b/src/lib/sources/gitHubSource.ts @@ -1,6 +1,8 @@ import type { DirSource, FileMetadata, FileSource, SourcePart } from './types.js' import { getFileName } from './utils.js' +// TODO(SL): support branches with slashes in their names (feature/foo) + interface BaseUrl { source: string origin: string From d1fce01b70fec804a435ea9cf7ff188c38881943 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 18 Mar 2026 15:05:13 +0100 Subject: [PATCH 10/12] remove concept of directory prefix, remove default branch in github source --- src/components/Folder/Folder.test.tsx | 3 -- src/components/Folder/Folder.tsx | 16 ++++------ src/lib/sources/gitHubSource.ts | 44 ++++++++++++++++----------- src/lib/sources/httpSource.ts | 1 - src/lib/sources/huggingFaceSource.ts | 4 --- src/lib/sources/hyperparamSource.ts | 1 - src/lib/sources/types.ts | 1 - test/lib/sources/gitHubSource.test.ts | 32 ++----------------- 8 files changed, 36 insertions(+), 66 deletions(-) diff --git a/src/components/Folder/Folder.test.tsx b/src/components/Folder/Folder.test.tsx index 1084203..c75a416 100644 --- a/src/components/Folder/Folder.test.tsx +++ b/src/components/Folder/Folder.test.tsx @@ -95,7 +95,6 @@ describe('Folder Component', () => { sourceId: 'test-source', sourceParts: [{ text: 'test-source', sourceId: 'test-source' }], kind: 'directory', - prefix: '', listFiles: () => Promise.resolve(mockFiles), } const { getByPlaceholderText, findByText, getByText, queryByText } = render() @@ -133,7 +132,6 @@ describe('Folder Component', () => { sourceId: 'test-source', sourceParts: [{ text: 'test-source', sourceId: 'test-source' }], kind: 'directory', - prefix: '', listFiles: () => Promise.resolve(mockFiles), } const { getByPlaceholderText, findByText } = render() @@ -153,7 +151,6 @@ describe('Folder Component', () => { sourceId: 'test-source', sourceParts: [{ text: 'test-source', sourceId: 'test-source' }], kind: 'directory', - prefix: '', listFiles: async () => { await fetch('something') // to ensure we wait for loading return [] diff --git a/src/components/Folder/Folder.tsx b/src/components/Folder/Folder.tsx index 3919a4a..b3ebd61 100644 --- a/src/components/Folder/Folder.tsx +++ b/src/components/Folder/Folder.tsx @@ -63,15 +63,15 @@ export default function Folder({ source }: FolderProps) { } else if (e.key === 'Enter') { // if there is only one result, view it if (filtered?.length === 1 && 0 in filtered) { - const key = join(source.prefix, filtered[0].name) - if (key.endsWith('/')) { + const file = filtered[0] + if (file.kind === 'directory') { // clear search because we're about to change folder if (searchRef.current) { searchRef.current.value = '' } setSearchQuery('') } - location.href = `/files?key=${key}` + location.href = routes?.getSourceRouteUrl?.({ sourceId: file.sourceId }) ?? `/files?key=${file.sourceId}` } } else if (e.key === 'ArrowDown') { // move focus to first list item @@ -81,7 +81,7 @@ export default function Folder({ source }: FolderProps) { searchElement?.addEventListener('keyup', handleKeyup) // Clean up event listener return () => searchElement?.removeEventListener('keyup', handleKeyup) - }, [filtered, source.prefix]) + }, [filtered, routes]) // Jump to search box if user types '/' useEffect(() => { @@ -97,7 +97,7 @@ export default function Folder({ source }: FolderProps) { return () => { document.removeEventListener('keydown', handleKeydown) } }, []) - return + return @@ -114,7 +114,7 @@ export default function Folder({ source }: FolderProps) {