目录文档-技术白皮书(V5.05)43-EFT.WP.Data.DatasetCards v1.0

第17章 示例与最佳实践


I. 章节目的与范围

提供从最小卡片到完整卡片的端到端示例,覆盖来源/采样、清洗/预处理、标签/本体、计量/不确定度、切分/分发、质量与基线、隐私与合规、发布与版本化,以及机器可读 Schema/Lint 与 API 落地;并给出常见失败模式与修复策略。键名统一 snake_case,引用采用“卷名+版本+锚点”,数学表达使用反引号与括号,禁用中文于公式。

II. 最小可用卡片(通过 Schema 与 Lint 的发布级示例)

# Minimal release-grade dataset card (example).
# NOTE(review): the source text had lost its indentation; the nesting below
# is reconstructed. Confirm against the card schema that `policy` belongs
# under `splits`, that `ci_method`/`target_ci` sit directly under `quality`,
# and that `references` belongs under `export_manifest`.
dataset_id: "eift.obs.demo"
title: "EIFT Demo Dataset"
version: "v1.0"
summary: "Minimal, release-grade card passing schema and lint with SI metrology and frozen splits."
modality: ["time_series"]
sources: ["doi:10.1234/demo"]
license: "CC-BY-4.0"
access: "open"
provenance:
  collection_method: "simulation"
  time_coverage: "2024-01-01..2024-12-31"
  selection_bias: "none"
sampling:
  strategy: "random"
  # Must agree with the per-split ratios under `splits`.
  rates: {train: 0.8, validation: 0.1, test: 0.1}
  seed: 1701
splits:
  train: {count: 800, ratio: 0.8}
  validation: {count: 100, ratio: 0.1}
  test: {count: 100, ratio: 0.1}
  policy: {leakage_guard: ["per-object"], freeze_indices: true}
preprocess:
  pipeline_id: "prep-minimal-v1"
  steps:
    - name: "normalize"
      enabled: true
      idempotent: true
      params: {type: "zscore", stats_from: "train-only"}
labels:
  schema_version: "v1.0"
  taxonomy:
    root: "event"
    nodes:
      - {id: "Signal", parent: "event", kind: "class", definition: "presence of pattern"}
      - {id: "Noise", parent: "event", kind: "background"}
  class_map: {include: ["Signal", "Noise"], exclude: []}
  encoding:
    type: "multi_class"
    policy:
      positive_rules: ["explicit-evidence"]
      negative_rules: ["contradiction-or-missing-signal"]
      tie_breaker: "lowest-risk"
metrology:
  units: "SI"
  c_ref: 299792458
  time_standard: "UTC"
  angle_unit: "deg"
  check_dim: true
quality:
  gates:
    - {name: "leakage", metric: "leakage_rate", threshold: 0.0}
    - {name: "coverage_min", metric: "split_coverage", threshold: 0.99}
  coverage:
    samples: 1000
    per_class: {"Signal": 420, "Noise": 580}
  ci_method: "bootstrap-bca"
  target_ci: 0.95
baseline:
  tasks: [{name: "cls_signal_vs_noise", type: "classification", split: "test"}]
  metrics: [{name: "f1_macro"}, {name: "roc_auc"}]
  eval_protocol:
    splits: "frozen"
    seeds: [0, 1, 2, 3, 4]
    repeats: 5
    ci: {method: "bootstrap-bca", level: 0.95}
    significance: {test: "permutation", alpha: 0.05}
checksums: {}
export_manifest:
  version: "v1.0"
  artifacts: []
  # Each reference follows the "<volume> vX.Y:<anchor>" form.
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"

(引用锚点采用“卷名 vX.Y:锚点”,与导出清单口径一致。)


III. 路径依赖量(含 T_arr)的完整示例

# Full dataset card example for a path-dependent quantity (T_arr).
# NOTE(review): the source text had lost its indentation; the nesting below
# is reconstructed. Confirm against the card schema that `uncertainty` is a
# top-level key (not nested under `metrology`), that `policy` belongs under
# `splits`, and that `references` belongs under `export_manifest`.
# NOTE(review): unlike the minimal card, this example carries no
# `license`/`access` keys — confirm whether they are required here.
dataset_id: "eift.radio.toa-set"
title: "Arrival-Time (TOA) Measurement Set"
version: "v1.2"
summary: "Path-dependent arrival-time dataset with iono/tropo/instrumental corrections and full uncertainty budget."
modality: ["radio", "time_series"]
sources: ["doi:10.5678/toa2025"]
provenance:
  collection_method: "beamformed-array"
  instruments: [{name: "LOFAR", station: "DE601"}]
  time_coverage: "2023-01-01..2025-06-30"
  spatial_coverage: "RA[120..240],Dec[-30..+30]"
  selection_bias: "flux-limited, SNR>=7"
sampling:
  strategy: "stratified"
  strata: [{by: "snr_bin", buckets: {"7-10": 300, "10-20": 500, "20+": 700}}]
  # Must agree with the per-split ratios under `splits`.
  rates: {train: 0.8, validation: 0.1, test: 0.1}
  seed: 1701
  audits: ["coverage", "leakage", "class-imbalance"]
preprocess:
  pipeline_id: "toa-prep-v2"
  steps:
    - name: "rfi_clean"
      enabled: true
      idempotent: true
      params: {method: "spectral-kurtosis", window: 256, thr_sigma: 5}
    - name: "filter"
      enabled: true
      idempotent: true
      params:
        type: "bandpass"
        # Explicit exponent sign so YAML 1.1 loaders also read these as floats.
        f_lo_hz: 1.2e+6
        f_hi_hz: 3.8e+6
        order: 5
        phase: "zero"
    - name: "normalize"
      enabled: true
      idempotent: true
      params: {type: "zscore", stats_from: "train-only"}
labels:
  schema_version: "v1.0"
  taxonomy:
    root: "event"
    nodes:
      - {id: "FRB", parent: "event", kind: "class", definition: "fast radio burst"}
      - {id: "RFI", parent: "event", kind: "artifact", definition: "radio frequency interference"}
  class_map: {include: ["FRB", "RFI"], exclude: []}
  encoding:
    type: "multi_class"
    policy:
      positive_rules: ["explicit-evidence"]
      negative_rules: ["contradiction-or-missing-signal"]
      tie_breaker: "lowest-risk"
path_dependence:
  applies_to: ["T_arr"]
  delta_form: "const-factor"  # or "general"
  path: "gamma(ell)"
  measure: "d ell"
  see:
    - "EFT.WP.Core.Equations v1.1:S20-1"
    - "EFT.WP.Core.Metrology v1.0:check_dim"
metrology:
  units: "SI"
  c_ref: 299792458
  time_standard: "UTC"
  angle_unit: "deg"
  check_dim: true
uncertainty:
  model: "GUM"
  components:
    # NOTE(review): `thermal` is given in K while the other components are
    # in s — confirm the unit is intended.
    - name: "thermal"
      type: "random"
      value: 2.1
      unit: "K"
      distribution: "normal"
      coverage: {k: 1.0}
    - name: "iono"
      type: "systematic"
      value: 0.7e-9
      unit: "s"
      distribution: "normal"
      coverage: {k: 2.0}
      corr_group: "path"
    - name: "tropo"
      type: "systematic"
      value: 0.3e-9
      unit: "s"
      distribution: "normal"
      coverage: {k: 2.0}
      corr_group: "path"
  correlation: {posture: "groups", groups: [{name: "path", pairwise: "rho=0.5"}]}
  propagation: {rule: "linear", linearization: "first-order"}
  coverage_policy: {target_p: 0.95, k: 2.0}
splits:
  train: {count: 12000, ratio: 0.8}
  validation: {count: 1500, ratio: 0.1}
  test: {count: 1500, ratio: 0.1}
  policy:
    leakage_guard: ["per-object", "per-timewindow"]
    stratify_by: ["snr_bin"]
    freeze_indices: true
distribution:
  packaging: {format: "tgz", shard_bytes: 134217728, layout: ["train", "validation", "test"]}
  mirrors: ["https://mirror-a.example/toa/", "s3://bucket/toa/"]
checksums:
  package: {sha256: "..."}
  shards:
    - {path: "train-000.tgz", sha256: "..."}
quality:
  gates:
    - {name: "leakage", metric: "leakage_rate", threshold: 0.0}
    - {name: "coverage_min", metric: "split_coverage", threshold: 0.99}
baseline:
  tasks: [{name: "cls_frb_vs_rfi", type: "classification", split: "test"}]
  metrics: [{name: "f1_macro"}, {name: "roc_auc"}, {name: "ece"}]
  eval_protocol:
    splits: "frozen"
    seeds: [0, 1, 2, 3, 4]
    repeats: 5
    ci: {method: "bootstrap-bca", level: 0.95}
    significance: {test: "permutation", alpha: 0.05}
privacy:
  policy: "no-PII"
export_manifest:
  version: "v1.2"
  artifacts:
    - {path: "splits/train.index", sha256: "..."}
    - {path: "packages/train-000.tgz", sha256: "..."}
    - {path: "quality/summary.csv", sha256: "..."}
  # Each reference follows the "<volume> vX.Y:<anchor>" form.
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"
    - "EFT.WP.Core.Equations v1.1:S20-1"

(路径/测度登记与两种 T_arr 等价表达按计量口径执行。)


IV. 常见失败模式与修复策略(对照 Lint 规则)


V. 最佳实践清单(发布必读)


VI. 与验证 API 的联动示例

# Structural and cross-volume validation of the card.
# The continuation lines must be contiguous: a blank line after a trailing
# backslash ends the command and the following -H/-d lines would be run as
# separate commands.
curl -s -X POST https://api.eift.org/api/v1/validate_card \
  -H "Authorization: Bearer <token>" \
  -H "x-eift-idempotency: a0d7b6c4-9478-4e4a-9f13-8c1b9f77f111" \
  -H "Content-Type: application/json" \
  -d @card.json

# Metrology (unit) consistency check of the card.
# The continuation lines must be contiguous: a blank line after a trailing
# backslash ends the command and the following -d line would be run as a
# separate command.
curl -s -X POST https://api.eift.org/api/v1/check_units \
  -H "Authorization: Bearer <token>" \
  -H "Content-Type: application/json" \
  -d @card.json

(API 返回统一结构,阻断项在发布前清零。)


VII. 附:失败—修复对照表(可纳入 CI 日志)

| 失败规则 | 典型症状 | 修复要点 | 参照条目 |
| --- | --- | --- | --- |
| SPLIT.RATIO_SUM | 三分比例之和≠1 | 对齐 sampling.rates 与 splits | 第11章;Lint 规则集 |
| REFERENCES.FORMAT | 引用缺版本/锚点 | 改为“卷名 vX.Y:锚点” | 引用规范;DataSpec 导出 |
| METROLOGY.SI_AND_CHECKDIM | 单位不一致/未启用校核 | 设 units="SI", check_dim=true | Core.Metrology v1.0 |
| PATH.TARR_FIELDS | T_arr 未登记路径/测度 | 填 delta_form/path/measure | Core.Equations v1.1:S20-* |
| SYMBOLS.CONFLICT | T_fil/T_trans 混用 | 分离张力/透射系数 | 写作清单口径 |


VIII. 本章合规自检


版权与许可:除另有说明外,《能量丝理论》(含文本、图表、插图、符号与公式)的著作权由作者(屠广林)享有。
许可方式(CC BY 4.0):在注明作者与来源的前提下,允许复制、转载、节选、改编与再分发。
署名格式(建议):作者:屠广林|作品:《能量丝理论》|来源:energyfilament.org|许可证:CC BY 4.0
验证召集: 作者独立自费、无雇主无资助;下一阶段将优先在最愿意公开讨论、公开复现、公开挑错的环境中推进落地,不限国家。欢迎各国媒体与同行抓住窗口组织验证,并与我们联系。
版本信息: 首次发布:2025-11-11 | 当前版本:v6.0+5.05