git4data-tutorial/08-ml-incremental/ml_incremental.sql at main · matrixorigin/git4data-tutorial · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
-- =============================================================================
-- Git4Data Tutorial — Part 8: ML Continuous Learning — train only the delta
-- Snapshot per training run; DIFF between runs yields exactly the rows that
-- changed — feed those to incremental training instead of full retrains.
-- Run:  mysql -h 127.0.0.1 -P 6001 -u root -p111 < ml_incremental.sql
-- =============================================================================

CREATE DATABASE mltrain_demo;
USE mltrain_demo;

-- The growing training set + a model registry that pins data versions.
CREATE TABLE samples (
    sample_id BIGINT PRIMARY KEY,
    features  VARCHAR(128),
    label     INT,
    src       VARCHAR(16)
);
CREATE TABLE model_registry (
    model    VARCHAR(32) PRIMARY KEY,
    data_tag VARCHAR(64),
    metric   DECIMAL(6,4)
);

INSERT INTO samples
SELECT result, concat('feat_', result), result % 2, 'batch_0'
FROM generate_series(1, 100000) g;

-- ------------------------------------------------- training run #1
-- Pin the exact data this model sees, then register the pair.
CREATE SNAPSHOT train_v1 FOR TABLE mltrain_demo samples;
-- (your trainer reads the table, fits model m1 ...)
INSERT INTO model_registry VALUES ('m1', 'train_v1', 0.9012);

-- ------------------------------------------------- the data keeps moving
-- A week of life: new batch arrives, some labels get corrected.
INSERT INTO samples
SELECT 100000 + result, concat('feat_n_', result), result % 2, 'batch_1'
FROM generate_series(1, 3000) g;
UPDATE samples SET label = 1 - label WHERE sample_id BETWEEN 500 AND 699;  -- 200 fixes

-- ------------------------------------------------- train only the delta
-- What changed since m1's training set? EXACTLY these rows — nothing else.
DATA BRANCH DIFF samples AGAINST samples {SNAPSHOT='train_v1'} OUTPUT SUMMARY;
--   INSERTED = 3000 (new batch), UPDATED = 200 (label fixes)
-- Your trainer pulls just these 3200 rows (OUTPUT LIMIT / OUTPUT FILE) and
-- calls partial_fit — instead of re-reading all 103,000.
-- Measured in our experiments: 6 rounds of incremental training processed
-- 6,012 rows total where full retrains would have processed 21,000.

-- ------------------------------------------------- training run #2
CREATE SNAPSHOT train_v2 FOR TABLE mltrain_demo samples;
INSERT INTO model_registry VALUES ('m2', 'train_v2', 0.9145);

-- ------------------------------------------------- exact reproduction
-- Months later: "rebuild m1's training set, bit for bit."
SELECT COUNT(*) FROM samples {snapshot='train_v1'};   -- 100000, as m1 saw it
SELECT COUNT(*) FROM samples;                          -- 103000, the present
-- Registry tells you which tag belongs to which model:
SELECT * FROM model_registry ORDER BY model;

-- ---------------------------------------------------------------- cleanup
DROP SNAPSHOT IF EXISTS train_v1;
DROP SNAPSHOT IF EXISTS train_v2;
DROP DATABASE IF EXISTS mltrain_demo;