Skip to content

Commit 74d0a47

Browse files
authored
fix(trigger): fix Google Sheets trigger header detection and row index tracking (#4109)
* fix(trigger): auto-detect header row and rename lastKnownRowCount to lastIndexChecked
  - Replace hardcoded !1:1 header fetch with detectHeaderRow(), which scans the first 10 rows and returns the first non-empty row as headers. This fixes row: null / headers: [] when a sheet has blank rows or a title row above the actual column headers (e.g. headers in row 3).
  - Rename lastKnownRowCount → lastIndexChecked in GoogleSheetsWebhookConfig and all usage sites to clarify that the value is a row index pointer, not a total count.
  - Remove config parameter from processRows() since it was unused after the includeHeaders flag was removed.
* fix(trigger): combine sheet state fetch, skip header/blank rows from data emission
  - Replace separate getDataRowCount() + detectHeaderRow() with a single fetchSheetState() call that returns rowCount, headers, and headerRowIndex from one A:Z fetch. Saves one Sheets API round-trip per poll cycle when new rows are detected.
  - Use headerRowIndex to compute adjustedStartRow, preventing the header row (and any blank rows above it) from being emitted as data events when lastIndexChecked was seeded from an empty sheet.
  - Handle the edge case where the entire batch falls within the header/blank window by advancing the pointer and returning early without fetching rows.
  - Skip empty rows (row.length === 0) in processRows rather than firing a workflow run with no meaningful data.
* fix(trigger): preserve lastModifiedTime when remaining rows exist after header skip
  When all rows in a batch fall within the header/blank window (adjustedStartRow > endRow), the early return was unconditionally updating lastModifiedTime to the current value. If there were additional rows beyond the batch cap, the next Drive pre-check would see an unchanged modifiedTime and skip polling entirely, leaving those rows unprocessed. Mirror the hasRemainingOrFailed pattern from the normal processing path.
* chore(trigger): remove verbose inline comments from google-sheets poller
* fix(trigger): revert to full-width A:Z fetch for correct row count and consistent column scope
* fix(trigger): don't count skipped empty rows as processed
1 parent c852585 commit 74d0a47

File tree

1 file changed

+94
-90
lines changed

1 file changed

+94
-90
lines changed

apps/sim/lib/webhooks/polling/google-sheets.ts

Lines changed: 94 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ import { processPolledWebhookEvent } from '@/lib/webhooks/processor'
1010

1111
const MAX_ROWS_PER_POLL = 100
1212

13+
/** Maximum number of leading rows to scan when auto-detecting the header row. */
14+
const HEADER_SCAN_ROWS = 10
15+
1316
type ValueRenderOption = 'FORMATTED_VALUE' | 'UNFORMATTED_VALUE' | 'FORMULA'
1417
type DateTimeRenderOption = 'SERIAL_NUMBER' | 'FORMATTED_STRING'
1518

@@ -20,7 +23,8 @@ interface GoogleSheetsWebhookConfig {
2023
manualSheetName?: string
2124
valueRenderOption?: ValueRenderOption
2225
dateTimeRenderOption?: DateTimeRenderOption
23-
lastKnownRowCount?: number
26+
/** 1-indexed row number of the last row seeded or processed. */
27+
lastIndexChecked?: number
2428
lastModifiedTime?: string
2529
lastCheckedTimestamp?: string
2630
maxRowsPerPoll?: number
@@ -63,7 +67,6 @@ export const googleSheetsPollingHandler: PollingProviderHandler = {
6367
return 'failure'
6468
}
6569

66-
// Pre-check: use Drive API to see if the file was modified since last poll
6770
const { unchanged: skipPoll, currentModifiedTime } = await isDriveFileUnchanged(
6871
accessToken,
6972
spreadsheetId,
@@ -83,44 +86,51 @@ export const googleSheetsPollingHandler: PollingProviderHandler = {
8386
return 'success'
8487
}
8588

86-
// Fetch current row count via column A
87-
const currentRowCount = await getDataRowCount(
89+
const valueRender = config.valueRenderOption || 'FORMATTED_VALUE'
90+
const dateTimeRender = config.dateTimeRenderOption || 'SERIAL_NUMBER'
91+
92+
const {
93+
rowCount: currentRowCount,
94+
headers,
95+
headerRowIndex,
96+
} = await fetchSheetState(
8897
accessToken,
8998
spreadsheetId,
9099
sheetName,
100+
valueRender,
101+
dateTimeRender,
91102
requestId,
92103
logger
93104
)
94105

95106
// First poll: seed state, emit nothing
96-
if (config.lastKnownRowCount === undefined) {
107+
if (config.lastIndexChecked === undefined) {
97108
await updateWebhookProviderConfig(
98109
webhookId,
99110
{
100-
lastKnownRowCount: currentRowCount,
111+
lastIndexChecked: currentRowCount,
101112
lastModifiedTime: currentModifiedTime ?? config.lastModifiedTime,
102113
lastCheckedTimestamp: now.toISOString(),
103114
},
104115
logger
105116
)
106117
await markWebhookSuccess(webhookId, logger)
107118
logger.info(
108-
`[${requestId}] First poll for webhook ${webhookId}, seeded row count: ${currentRowCount}`
119+
`[${requestId}] First poll for webhook ${webhookId}, seeded row index: ${currentRowCount}`
109120
)
110121
return 'success'
111122
}
112123

113-
// Rows deleted or unchanged
114-
if (currentRowCount <= config.lastKnownRowCount) {
115-
if (currentRowCount < config.lastKnownRowCount) {
124+
if (currentRowCount <= config.lastIndexChecked) {
125+
if (currentRowCount < config.lastIndexChecked) {
116126
logger.warn(
117-
`[${requestId}] Row count decreased from ${config.lastKnownRowCount} to ${currentRowCount} for webhook ${webhookId}`
127+
`[${requestId}] Row count decreased from ${config.lastIndexChecked} to ${currentRowCount} for webhook ${webhookId}`
118128
)
119129
}
120130
await updateWebhookProviderConfig(
121131
webhookId,
122132
{
123-
lastKnownRowCount: currentRowCount,
133+
lastIndexChecked: currentRowCount,
124134
lastModifiedTime: currentModifiedTime ?? config.lastModifiedTime,
125135
lastCheckedTimestamp: now.toISOString(),
126136
},
@@ -131,38 +141,47 @@ export const googleSheetsPollingHandler: PollingProviderHandler = {
131141
return 'success'
132142
}
133143

134-
// New rows detected
135-
const newRowCount = currentRowCount - config.lastKnownRowCount
144+
const newRowCount = currentRowCount - config.lastIndexChecked
136145
const maxRows = config.maxRowsPerPoll || MAX_ROWS_PER_POLL
137146
const rowsToFetch = Math.min(newRowCount, maxRows)
138-
const startRow = config.lastKnownRowCount + 1
139-
const endRow = config.lastKnownRowCount + rowsToFetch
147+
const startRow = config.lastIndexChecked + 1
148+
const endRow = config.lastIndexChecked + rowsToFetch
149+
150+
// Skip past the header row (and any blank rows above it) so it is never
151+
// emitted as a data event.
152+
const adjustedStartRow =
153+
headerRowIndex > 0 ? Math.max(startRow, headerRowIndex + 1) : startRow
140154

141155
logger.info(
142-
`[${requestId}] Found ${newRowCount} new rows for webhook ${webhookId}, processing rows ${startRow}-${endRow}`
156+
`[${requestId}] Found ${newRowCount} new rows for webhook ${webhookId}, processing rows ${adjustedStartRow}-${endRow}`
143157
)
144158

145-
// Resolve render options
146-
const valueRender = config.valueRenderOption || 'FORMATTED_VALUE'
147-
const dateTimeRender = config.dateTimeRenderOption || 'SERIAL_NUMBER'
148-
149-
const headers = await fetchHeaderRow(
150-
accessToken,
151-
spreadsheetId,
152-
sheetName,
153-
valueRender,
154-
dateTimeRender,
155-
requestId,
156-
logger
157-
)
159+
// Entire batch is header/blank rows — advance pointer and skip fetch.
160+
if (adjustedStartRow > endRow) {
161+
const hasRemainingRows = rowsToFetch < newRowCount
162+
await updateWebhookProviderConfig(
163+
webhookId,
164+
{
165+
lastIndexChecked: config.lastIndexChecked + rowsToFetch,
166+
lastModifiedTime: hasRemainingRows
167+
? config.lastModifiedTime
168+
: (currentModifiedTime ?? config.lastModifiedTime),
169+
lastCheckedTimestamp: now.toISOString(),
170+
},
171+
logger
172+
)
173+
await markWebhookSuccess(webhookId, logger)
174+
logger.info(
175+
`[${requestId}] Batch ${startRow}-${endRow} contained only header/blank rows for webhook ${webhookId}, advancing pointer`
176+
)
177+
return 'success'
178+
}
158179

159-
// Fetch new rows — startRow/endRow are already 1-indexed sheet row numbers
160-
// because lastKnownRowCount includes the header row
161180
const newRows = await fetchRowRange(
162181
accessToken,
163182
spreadsheetId,
164183
sheetName,
165-
startRow,
184+
adjustedStartRow,
166185
endRow,
167186
valueRender,
168187
dateTimeRender,
@@ -173,23 +192,22 @@ export const googleSheetsPollingHandler: PollingProviderHandler = {
173192
const { processedCount, failedCount } = await processRows(
174193
newRows,
175194
headers,
176-
startRow,
195+
adjustedStartRow,
177196
spreadsheetId,
178197
sheetName,
179-
config,
180198
webhookData,
181199
workflowData,
182200
requestId,
183201
logger
184202
)
185203

186204
const rowsAdvanced = failedCount > 0 ? 0 : rowsToFetch
187-
const newLastKnownRowCount = config.lastKnownRowCount + rowsAdvanced
205+
const newLastIndexChecked = config.lastIndexChecked + rowsAdvanced
188206
const hasRemainingOrFailed = rowsAdvanced < newRowCount
189207
await updateWebhookProviderConfig(
190208
webhookId,
191209
{
192-
lastKnownRowCount: newLastKnownRowCount,
210+
lastIndexChecked: newLastIndexChecked,
193211
lastModifiedTime: hasRemainingOrFailed
194212
? config.lastModifiedTime
195213
: (currentModifiedTime ?? config.lastModifiedTime),
@@ -256,20 +274,32 @@ async function getDriveFileModifiedTime(
256274
}
257275
}
258276

259-
async function getDataRowCount(
277+
/**
278+
* Fetches the sheet (A:Z) and returns the row count, auto-detected headers,
279+
* and the 1-indexed header row number in a single API call.
280+
*
281+
* The Sheets API omits trailing empty rows, so `rows.length` equals the last
282+
* non-empty row in columns A–Z. Header detection scans the first
283+
* {@link HEADER_SCAN_ROWS} rows for the first non-empty row. Returns
284+
* `headerRowIndex = 0` when no header is found within the scan window.
285+
*/
286+
async function fetchSheetState(
260287
accessToken: string,
261288
spreadsheetId: string,
262289
sheetName: string,
290+
valueRenderOption: ValueRenderOption,
291+
dateTimeRenderOption: DateTimeRenderOption,
263292
requestId: string,
264293
logger: ReturnType<typeof import('@sim/logger').createLogger>
265-
): Promise<number> {
294+
): Promise<{ rowCount: number; headers: string[]; headerRowIndex: number }> {
266295
const encodedSheet = encodeURIComponent(sheetName)
267-
// Fetch all rows across columns A–Z with majorDimension=ROWS so the API
268-
// returns one entry per row that has ANY non-empty cell. Rows where column A
269-
// is empty but other columns have data are included, whereas the previous
270-
// column-A-only approach silently missed them. The returned array length
271-
// equals the 1-indexed row number of the last row with data.
272-
const url = `https://sheets.googleapis.com/v4/spreadsheets/${spreadsheetId}/values/${encodedSheet}!A:Z?majorDimension=ROWS&fields=values`
296+
const params = new URLSearchParams({
297+
majorDimension: 'ROWS',
298+
fields: 'values',
299+
valueRenderOption,
300+
dateTimeRenderOption,
301+
})
302+
const url = `https://sheets.googleapis.com/v4/spreadsheets/${spreadsheetId}/values/${encodedSheet}!A:Z?${params.toString()}`
273303

274304
const response = await fetch(url, {
275305
headers: { Authorization: `Bearer ${accessToken}` },
@@ -278,61 +308,32 @@ async function getDataRowCount(
278308
if (!response.ok) {
279309
const status = response.status
280310
const errorData = await response.json().catch(() => ({}))
281-
282311
if (status === 403 || status === 429) {
283312
throw new Error(
284313
`Sheets API rate limit (${status}) — skipping to retry next poll cycle: ${JSON.stringify(errorData)}`
285314
)
286315
}
287-
288316
throw new Error(
289-
`Failed to fetch row count: ${status} ${response.statusText} - ${JSON.stringify(errorData)}`
317+
`Failed to fetch sheet state: ${status} ${response.statusText} - ${JSON.stringify(errorData)}`
290318
)
291319
}
292320

293321
const data = await response.json()
294-
// values is [[row1col1, row1col2, ...], [row2col1, ...], ...] when majorDimension=ROWS.
295-
// The Sheets API omits trailing empty rows, so the array length is the last
296-
// non-empty row index (1-indexed), which is exactly what we need.
297-
const rows = data.values as string[][] | undefined
298-
return rows?.length ?? 0
299-
}
322+
const rows = (data.values as string[][] | undefined) ?? []
323+
const rowCount = rows.length
300324

301-
async function fetchHeaderRow(
302-
accessToken: string,
303-
spreadsheetId: string,
304-
sheetName: string,
305-
valueRenderOption: ValueRenderOption,
306-
dateTimeRenderOption: DateTimeRenderOption,
307-
requestId: string,
308-
logger: ReturnType<typeof import('@sim/logger').createLogger>
309-
): Promise<string[]> {
310-
const encodedSheet = encodeURIComponent(sheetName)
311-
const params = new URLSearchParams({
312-
fields: 'values',
313-
valueRenderOption,
314-
dateTimeRenderOption,
315-
})
316-
const url = `https://sheets.googleapis.com/v4/spreadsheets/${spreadsheetId}/values/${encodedSheet}!1:1?${params.toString()}`
317-
318-
const response = await fetch(url, {
319-
headers: { Authorization: `Bearer ${accessToken}` },
320-
})
321-
322-
if (!response.ok) {
323-
const status = response.status
324-
if (status === 403 || status === 429) {
325-
const errorData = await response.json().catch(() => ({}))
326-
throw new Error(
327-
`Sheets API rate limit (${status}) fetching header row — skipping to retry next poll cycle: ${JSON.stringify(errorData)}`
328-
)
325+
let headers: string[] = []
326+
let headerRowIndex = 0
327+
for (let i = 0; i < Math.min(rows.length, HEADER_SCAN_ROWS); i++) {
328+
const row = rows[i]
329+
if (row?.some((cell) => cell !== '')) {
330+
headers = row
331+
headerRowIndex = i + 1
332+
break
329333
}
330-
logger.warn(`[${requestId}] Failed to fetch header row, proceeding without headers`)
331-
return []
332334
}
333335

334-
const data = await response.json()
335-
return (data.values?.[0] as string[]) ?? []
336+
return { rowCount, headers, headerRowIndex }
336337
}
337338

338339
async function fetchRowRange(
@@ -361,13 +362,11 @@ async function fetchRowRange(
361362
if (!response.ok) {
362363
const status = response.status
363364
const errorData = await response.json().catch(() => ({}))
364-
365365
if (status === 403 || status === 429) {
366366
throw new Error(
367367
`Sheets API rate limit (${status}) — skipping to retry next poll cycle: ${JSON.stringify(errorData)}`
368368
)
369369
}
370-
371370
throw new Error(
372371
`Failed to fetch rows ${startRow}-${endRow}: ${status} ${response.statusText} - ${JSON.stringify(errorData)}`
373372
)
@@ -383,7 +382,6 @@ async function processRows(
383382
startRowIndex: number,
384383
spreadsheetId: string,
385384
sheetName: string,
386-
config: GoogleSheetsWebhookConfig,
387385
webhookData: PollWebhookContext['webhookData'],
388386
workflowData: PollWebhookContext['workflowData'],
389387
requestId: string,
@@ -394,7 +392,13 @@ async function processRows(
394392

395393
for (let i = 0; i < rows.length; i++) {
396394
const row = rows[i]
397-
const rowNumber = startRowIndex + i // startRowIndex is already the 1-indexed sheet row
395+
const rowNumber = startRowIndex + i
396+
397+
// Skip empty rows — don't fire a workflow run with no data.
398+
if (!row || row.length === 0) {
399+
logger.info(`[${requestId}] Skipping empty row ${rowNumber} for webhook ${webhookData.id}`)
400+
continue
401+
}
398402

399403
try {
400404
await pollingIdempotency.executeWithIdempotency(

0 commit comments

Comments
 (0)