目录文档-技术白皮书46-EFT.WP.Data.Benchmarks v1.0

第18章 附录:Benchmark 模板


I. 模板范围与口径

本章提供最小模板与完整骨架模板两套可直接落盘的 YAML/JSON 模板;键名统一 snake_case;跨卷引用采用“卷名 vX.Y:锚点”;单位体系遵循 SI 与 check_dim=true;评测一律使用冻结切分与泄漏护栏。

II. 最小模板(可直接复制)

# Minimal benchmark template: one suite with a single task ("cls.binary"),
# frozen train/val/test splits, one metric and an export manifest.
# NOTE(review): the source lost its indentation; the nesting below is
# reconstructed from the key names and from the top-level keys of the blank
# machine-readable template (suite / tasks / metrology / export_manifest).
# Confirm the placement of `ratio`, `freeze_indices`, `leakage_guard` and the
# task-level `metrics` against the schema chapters before publishing.
suite:
  id: "eift.bench.core"
  title: "EIFT Core Benchmarks"
  version: "v1.0"
  modalities: ["text"]
tasks:
  - id: "cls.binary"
    io_mode: "offline"
    dataset_ref: "datasets/core_cls@v1.0"
    splits:
      # Flow mappings need a space after each colon; `{frozen:true}` would be
      # read as a single key named "frozen:true" with a null value.
      train: {frozen: true, index: "splits/train.index", sha256: "<hex>"}
      val: {frozen: true, index: "splits/val.index", sha256: "<hex>"}
      test: {frozen: true, index: "splits/test.index", sha256: "<hex>"}
      ratio: {train: 0.8, val: 0.1, test: 0.1}
      freeze_indices: true
      leakage_guard: ["per-object"]
    protocol:
      mode: "offline"
      seed: 1701
      repeats: 5
    metrics:
      - {name: "F1_macro", family: "classification", unit: "—", higher_is_better: true, agg: "macro"}
aggregation: {levels: ["task"], weights: {scheme: "uniform"}}
significance: {method: "bootstrap", alpha: 0.05}
metrology: {units: "SI", check_dim: true}
export_manifest:
  version: "v1.0"
  artifacts:
    - {path: "benchmark.yaml", sha256: "<hex>"}
    - {path: "splits/train.index", sha256: "<hex>"}
    - {path: "splits/val.index", sha256: "<hex>"}
    - {path: "splits/test.index", sha256: "<hex>"}
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"


III. 完整骨架模板(发布级,含可选扩展)

# Release-grade skeleton template with the optional extensions.
# Values written as "a|b|c" are placeholders listing the allowed choices;
# "<...>" values are placeholders to be filled in.
# NOTE(review): the source lost its indentation; the nesting below is
# reconstructed from the key names and from the top-level keys of the blank
# machine-readable template (suite / tasks / metrology / export_manifest).
# Confirm in particular that `risks` and `coverage_matrix` belong under
# `suite`, that `normalization` belongs under `aggregation`, and that the
# metric list belongs to each task, against the schema chapters.
suite:
  id: "<org.project.bench>"
  title: "<Human-readable Title>"
  version: "v1.0.0"
  modalities: ["text", "image", "audio"]
  risks: ["leakage", "bias", "spurious_correlation"]
  coverage_matrix:
    modality: {"text": 9000, "image": 6000, "audio": 3000}  # count or %
    locale: {"en": 60, "zh": 20, "es": 20}  # %
    domain: {"news": 40, "science": 30, "open": 30}  # %
tasks:
  - id: "<task.id>"
    title: "<Task Title>"
    io_mode: "offline|online|stream|interactive"
    evaluatee: "model|system|pipeline"
    dataset_ref: "datasets/<name>@vX.Y"
    sampling:
      strategy: "random|stratified|time-based|spatial-tiles|systematic"
      strata: [{by: "<label|locale|domain|difficulty>", buckets: {"A": 100, "B": 200}}]
      seed: 1701
    splits:
      # Flow mappings need a space after each colon; `{frozen:true}` would be
      # read as a single key named "frozen:true" with a null value.
      train: {frozen: true, index: "splits/train.index", sha256: "<hex>"}
      val: {frozen: true, index: "splits/val.index", sha256: "<hex>"}
      test: {frozen: true, index: "splits/test.index", sha256: "<hex>"}
      ratio: {train: 0.8, val: 0.1, test: 0.1}
      freeze_indices: true
      leakage_guard: ["per-object", "per-timewindow", "per-scene"]
    protocol:
      mode: "offline|online|stream|interactive"
      seed: 1701
      repeats: 5
      temperature: 0.0
      context: {length: 4096, template_ref: "prompts/<id>@vX.Y"}
      tools: {allowed: false, retrieval: false, open_book: false, registry_ref: null}
      runtime_limits: {timeout_s: 3600, memory_gb: 16}
      execution:
        concurrency: 8
        rate_limit_qps: 50
        batching: {enabled: true, max_batch: 32}
      stream: {window_ms: 1000, hop_ms: 250, max_latency_ms: 200, watermark: "event_time"}
      interactive: {rounds: 3, turn_timeout_s: 30, max_context_turns: 8}
      online:
        traffic_allocation: {control: 0.5, treatment: 0.5}
        exposure: {shadow: true, canary: 0.05}
        guardrails: ["latency_ms.p99<=200", "error_rate<=0.01"]
      logging:
        format: "jsonl"
        fields: ["ts", "task_id", "item_id", "run_id", "latency_ms"]
        retention: "P30D"
      reporting:
        metrics: ["F1_macro", "ECE", "latency_ms.p99", "QPS"]
        target_ci: {method: "bootstrap", level: 0.95}
    metrics:
      - {name: "F1_macro", family: "classification", unit: "—", higher_is_better: true, agg: "macro", window: "N/A"}
      - {name: "ECE", family: "calibration", unit: "—", higher_is_better: false, agg: "mean", window: "N/A"}
      - {name: "latency_ms.p99", family: "perf", unit: "ms", higher_is_better: false, agg: "quant", window: "1m"}
aggregation:
  levels: ["task", "suite"]
  weights: {scheme: "uniform|sample_share|expert", w_i: null}
  metrics:
    include: ["F1_macro", "ECE", "latency_ms.p99"]
    directions: {F1_macro: "max", ECE: "min", "latency_ms.p99": "min"}
    combine: "weighted_mean|geomean|harmonic"
  normalization:
    scheme: "zscore|minmax|fixed-anchor"
    params:
      zscore: {μ_ref: "suite|task|anchors", σ_ref: "suite|task|anchors"}
      minmax: {min_ref: "anchors", max_ref: "anchors"}
      fixed_anchor:
        anchors: ["baseline.logreg", "baseline.rf"]
        anchor_scores:
          baseline.logreg: {F1_macro: 0.72, ECE: 0.06, "latency_ms.p99": 180}
          baseline.rf: {F1_macro: 0.75, ECE: 0.05, "latency_ms.p99": 170}
significance:
  method: "bootstrap|permutation|t|bayes"
  B: 10000
  alpha: 0.05
  correction: "Holm-Bonferroni|BH|none"
robustness:
  shift_tests:
    - {name: "snr_drop", severity: [3, 6, 9], unit: "dB", policy: "additive-noise"}
  thresholds: {drop_rel_max: 0.10, acc_robust_min: 0.80}
fairness_ethics:
  slices: [{axis: "locale", buckets: ["en", "zh", "es"]}]
  gap_metric: "abs_diff|ratio|stat_parity|eq_opp"
  thresholds: {fairness_warn: 0.03, fairness_block: 0.05}
env:
  hardware:
    cpu: "16c@3.0GHz"
    mem_gb: 64
    gpu: "0|A100-40GB:x1"
    storage: {type: "nvme", iops: ">=50k", size_gb: 512}
    network: {nic_gbps: 10}
  os: "ubuntu-22.04"
  kernel: "linux-6.8"
  containers: ["ghcr.io/eift/runner@sha256:<hex>"]
  deps_lock: "env.lock"
metrology: {units: "SI", check_dim: true}
export_manifest:
  version: "v1.0"
  artifacts:
    - {path: "benchmark.yaml", sha256: "<hex>"}
    - {path: "splits/train.index", sha256: "<hex>"}
    - {path: "splits/val.index", sha256: "<hex>"}
    - {path: "splits/test.index", sha256: "<hex>"}
    - {path: "reports/summary.json", sha256: "<hex>"}
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"
    - "EFT.WP.Data.ModelCards v1.0:Ch.11"
    - "EFT.WP.Data.Benchmarks v1.0:Ch.6"


IV. 字段占位符与最小正则(速查)


V. 导出清单模板(规范性)

# Normative export manifest: lists every published artifact with its SHA-256
# placeholder, followed by the cross-volume references ("Volume vX.Y:anchor").
# `references` is nested inside `export_manifest`, matching the blank
# machine-readable template in this chapter.
export_manifest:
  version: "v1.0"
  artifacts:
    # Flow mappings need a space after each colon; `{path:"x"}` would be read
    # as a single key with a null value.
    - {path: "benchmark.yaml", sha256: "<hex>"}
    - {path: "splits/train.index", sha256: "<hex>"}
    - {path: "splits/val.index", sha256: "<hex>"}
    - {path: "splits/test.index", sha256: "<hex>"}
    - {path: "reports/summary.json", sha256: "<hex>"}
    - {path: "reports/leaderboard.csv", sha256: "<hex>"}
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"
    - "EFT.WP.Data.ModelCards v1.0:Ch.11"
    - "EFT.WP.Data.Benchmarks v1.0:Ch.8"
    - "EFT.WP.Data.Benchmarks v1.0:Ch.9"


VI. 发布前阻断自检(清单)


VII. 机器可读空白模板(无注释,CI 友好)

suite: {id: "", title: "", version: "v1.0", modalities: []}
tasks: []
metrology: {units: "SI", check_dim: true}
export_manifest: {version: "v1.0", artifacts: [], references: ["EFT.WP.Core.DataSpec v1.0:EXPORT", "EFT.WP.Core.Metrology v1.0:check_dim"]}


版权与许可(CC BY 4.0)

版权声明:除另有说明外,《能量丝理论》(含文本、图表、插图、符号与公式)的著作权由作者(“屠广林”先生)享有。
许可方式:本作品采用 Creative Commons 署名 4.0 国际许可协议(CC BY 4.0)进行许可;在注明作者与来源的前提下,允许为商业或非商业目的进行复制、转载、节选、改编与再分发。
署名格式(建议):作者:“屠广林”;作品:《能量丝理论》;来源:energyfilament.org;许可证:CC BY 4.0。

首次发布: 2025-11-11|当前版本:v5.1
协议链接:https://creativecommons.org/licenses/by/4.0/