目录文档-技术白皮书(V5.05)43-EFT.WP.Data.DatasetCards v1.0

第18章 附录:Dataset Card 模板


I. 模板范围与口径

本附录提供两套可直接落盘的 YAML/JSON 卡片模板:最小模板与完整骨架模板;字段命名一律 snake_case;引用采用“卷名 vX.Y:锚点”;单位体系与量纲校核遵循 SI 与 check_dim=true。

II. 最小模板(可直接复制)

# ===== Minimal Dataset Card (release-grade) =====
# Smallest card: identity, provenance, splits, metrology, one quality gate,
# and the export manifest. Replace every "<...>" placeholder before release.
dataset_id: "<org.project.dataset>"
title: "<Human-readable Title>"
version: "v1.0"
summary: "<100–300 chars brief purpose, coverage, limitations>"
modality: ["time_series"]  # radio|optical|image|time_series|text|tabular
sources: ["<doi:...>", "<dataset_id@vX.Y>"]
license: "<SPDX or policy>"
access: "open"  # open|restricted|closed

provenance:
  collection_method: "<simulation|survey|beamformed-array|...>"
  time_coverage: "<YYYY-MM-DD..YYYY-MM-DD>"
  # spatial_coverage / selection_bias are optional

splits:
  train: {count: 0, ratio: 0.8}
  validation: {count: 0, ratio: 0.1}
  test: {count: 0, ratio: 0.1}
  policy: {leakage_guard: ["per-object"], freeze_indices: true}

metrology:
  units: "SI"
  c_ref: 299792458
  angle_unit: "deg"
  time_standard: "UTC"
  check_dim: true

quality:
  gates:
    - {name: "leakage", metric: "leakage_rate", threshold: 0.0}

# NOTE(review): the source lost its indentation; confirm whether checksums
# belongs at top level (as written here) or inside another section.
checksums: {}

export_manifest:
  version: "v1.0"
  artifacts: []
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"

(引用采用“卷名 vX.Y:锚点”,与导出清单一致。)


III. 完整骨架模板(发布级,含可选扩展)

# ===== Full Dataset Card Skeleton =====
# Identity and access fields, followed by the provenance section.
# Replace every "<...>" placeholder before release.
dataset_id: "<org.project.dataset>"
title: "<Human-readable Title>"
version: "v1.0.0"
summary: "<purpose, scope, coverage, limitations (100–300 chars)>"
modality: ["radio", "time_series"]
sources: ["<doi:...>", "<eift.obs.base@v1.2>"]
license: "<SPDX or policy>"
access: "open"  # open|restricted|closed

provenance:
  collection_method: "<beamformed-array|drift-scan|survey-aggregation|simulation>"
  instruments:
    - {name: "<string>", station: "<string>", role: "rx"}
  time_coverage: "<YYYY-MM-DD..YYYY-MM-DD>"
  spatial_coverage: "<RA/Dec ranges | CRS:EPSG:4326>"
  selection_bias: "<flux-limited, SNR>=7>"
  # NOTE(review): permits is assumed to sit under provenance — confirm.
  permits: ["<ethics/permit-ref>"]

# Sampling plan: strategy, strata, per-split rates, seed, and audits.
sampling:
  strategy: "<random|stratified|systematic|time-based|spatial-tiles>"
  strata:
    - by: "class"
      buckets: {"A": 100, "B": 200}
  rates: {train: 0.8, validation: 0.1, test: 0.1}
  seed: 1701
  replacement: false
  dedup_policy: "per-object"
  audits: ["coverage", "leakage", "class-imbalance"]

# Preprocessing pipeline: ordered steps plus the settings recorded with them.
preprocess:
  pipeline_id: "<pipeline-name>"
  steps:
    - name: "rfi_clean"
      enabled: true
      idempotent: true
      params: {method: "spectral-kurtosis", window: 256, thr_sigma: 5}
      inputs: ["raw_spec"]
      outputs: ["mask_spec"]
  parameter_lock: true
  # NOTE(review): randomness, environment and audits are assumed to be
  # pipeline-level keys (siblings of steps) — confirm.
  randomness:
    seed: 1701
    libraries: {numpy: "1.26.4"}
  environment:
    os: "ubuntu22.04"
    containers: ["ghcr.io/eift/card-prep:1.0.2"]
  audits: ["nan-check", "range-check", "leakage"]

# Label schema: taxonomy, class map, encoding, labelling policy, translations.
labels:
  schema_version: "v1.0"
  taxonomy:
    root: "event"
    nodes:
      - {id: "FRB", parent: "event", kind: "class", definition: "fast radio burst"}
      - {id: "RFI", parent: "event", kind: "artifact", definition: "radio frequency interference"}
  # NOTE(review): class_map and policy are assumed to be direct children of
  # labels (not of taxonomy / encoding) — confirm.
  class_map: {include: ["FRB", "RFI"], exclude: []}
  encoding:
    type: "multi_class"  # multi_class|multi_label|hierarchical
  policy:
    positive_rules: ["explicit-evidence"]
    negative_rules: ["contradiction-or-missing-signal"]
    tie_breaker: "lowest-risk"
  multilingual:
    default_lang: "en"
    map:
      FRB: {en: "FRB", zh: "快速射电暴"}
      RFI: {en: "RFI", zh: "射频干扰"}

# Metrology: unit system, reference constant, time/angle conventions, and the
# dimensional-check switch.
metrology:
  units: "SI"
  c_ref: 299792458
  time_standard: "UTC"
  angle_unit: "deg"
  check_dim: true
  # Enable the block below only when path-dependent quantities (e.g. T_arr)
  # are involved.
  # NOTE(review): path_dependence is assumed to nest under metrology — confirm.
  path_dependence:
    applies_to: ["T_arr"]
    delta_form: "const-factor"  # or "general"
    path: "gamma(ell)"
    measure: "d ell"
    see:
      - "EFT.WP.Core.Equations v1.1:S20-1"
      - "EFT.WP.Core.Metrology v1.0:check_dim"

# Uncertainty budget: model, components, correlation, propagation, coverage.
uncertainty:
  model: "GUM"  # GUM|bayesian|montecarlo
  components:
    - name: "thermal"
      type: "random"
      value: 2.1
      unit: "K"
      distribution: "normal"
      coverage: {k: 1.0}
    - name: "cal_gain"
      type: "systematic"
      value: 0.8
      unit: "%"
      distribution: "normal"
      coverage: {k: 2.0}
      corr_group: "instrument"
  correlation:
    posture: "groups"
    groups:
      - {name: "instrument", pairwise: "rho=0.6"}
  propagation: {rule: "linear", linearization: "first-order", samples: 0}
  coverage_policy: {target_p: 0.95, k: 2.0}

# Dataset splits: per-split counts and ratios, the split policy, and audits.
splits:
  train: {count: 0, ratio: 0.8}
  validation: {count: 0, ratio: 0.1}
  test: {count: 0, ratio: 0.1}
  policy:
    leakage_guard: ["per-object", "per-timewindow"]
    stratify_by: ["class", "region"]
    freeze_indices: true
  # NOTE(review): audit is assumed to be a child of splits — confirm.
  audit:
    coverage: {by: "class", report: true}
    leakage: {cross_split: "forbid"}

# Distribution: packaging, mirrors, rate limit, and regional compliance tags.
distribution:
  packaging:
    format: "tgz"
    shard_bytes: 134217728
    layout: ["train", "validation", "test"]
  mirrors: ["https://mirror-a.example/foo/", "s3://bucket/foo/"]
  rate_limit: {mbps: 50}
  regional_compliance: ["EU-GDPR"]  # if applicable

# SHA-256 digests for the whole package and for each shard.
# NOTE(review): the source lost its indentation; confirm whether checksums is a
# top-level key (as written here) or belongs under distribution.
checksums:
  package: {sha256: "<hex>"}
  shards:
    - {path: "train-000.tgz", sha256: "<hex>"}

# Quality gates and the coverage summary.
quality:
  gates:
    - {name: "label_consistency", metric: "kappa", threshold: 0.98}
    - {name: "leakage", metric: "leakage_rate", threshold: 0.0}
    - {name: "coverage_min", metric: "split_coverage", threshold: 0.99}
  # NOTE(review): coverage is assumed to be a child of quality — confirm.
  coverage:
    samples: 0
    per_class: {}
    ci_method: "bootstrap-bca"
    target_ci: 0.95

# Baseline tasks and metrics, with the evaluation protocol.
# NOTE(review): the source lost its indentation; confirm whether baseline is a
# top-level key and whether eval_protocol nests under it (as written here).
baseline:
  tasks:
    - {name: "cls_frb_vs_rfi", type: "classification", split: "test"}
  metrics:
    - {name: "f1_macro"}
    - {name: "roc_auc"}
    - {name: "ece"}
    - {name: "brier"}
  eval_protocol:
    splits: "frozen"
    seeds: [0, 1, 2, 3, 4]
    repeats: 5
    ci: {method: "bootstrap-bca", level: 0.95}
    significance: {test: "permutation", alpha: 0.05}
    robustness:
      shift_tests: ["snr_drop", "time_jitter", "spec_notch"]

# Privacy posture of the dataset.
privacy:
  policy: "no-PII"  # no-PII|limited-PII|special-category
  lawful_basis: ["research"]
  data_minimization: true

# Intended and prohibited uses.
ethics:
  intended_use: ["academic", "benchmark"]
  prohibited_use: ["surveillance"]

# Release metadata: channel, version, date, and compatibility statement.
release:
  channel: "stable"  # alpha|beta|rc|stable|yanked
  version: "v1.0.0"
  date: "2025-09-20"
  compatibility: {baseline: "v1.*", backwards: "minor"}

# Export manifest: exported artifacts with digests, plus cited references
# in "volume vX.Y:anchor" form.
export_manifest:
  version: "v1.0"
  artifacts:
    - {path: "splits/train.index", sha256: "..."}
    - {path: "quality/summary.csv", sha256: "..."}
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"
    - "EFT.WP.Core.Equations v1.1:S20-1"

(单位/量纲与引用锚点的格式为强制要求。)


IV. 字段占位符与最小正则(便捷对照)


V. 导出清单模板(规范性)

# Normative export manifest template: every artifact carries a SHA-256 digest;
# references use the "volume vX.Y:anchor" form.
export_manifest:
  version: "v1.0"
  artifacts:
    - {path: "dataset_card.yaml", sha256: "<hex>"}
    - {path: "splits/train.index", sha256: "<hex>"}
    - {path: "packages/train-000.tgz", sha256: "<hex>"}
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"

(导出物须逐项可验;引用携带“卷名+版本+锚点”。)


VI. 发布前阻断自检(清单)


VII. 机器可读空白模板(无注释版,适合 CI 生成)

dataset_id: ""
title: ""
version: "v1.0"
summary: ""
modality: []
sources: []
license: ""
access: "open"
provenance: {collection_method: "", time_coverage: ""}
sampling: {}
preprocess: {pipeline_id: "", steps: [], parameter_lock: true}
labels: {}
metrology:
  units: "SI"
  c_ref: 299792458
  time_standard: "UTC"
  angle_unit: "deg"
  check_dim: true
uncertainty: {}
splits:
  train: {count: 0, ratio: 0.8}
  validation: {count: 0, ratio: 0.1}
  test: {count: 0, ratio: 0.1}
  policy: {leakage_guard: ["per-object"], freeze_indices: true}
distribution: {}
quality: {}
privacy: {}
ethics: {}
release:
  channel: "stable"
  version: "v1.0.0"
  date: "2025-09-20"
  compatibility: {baseline: "v1.*", backwards: "minor"}
export_manifest:
  version: "v1.0"
  artifacts: []
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"

(可直接用于自动化生成,再按需要增量填充。)


版权与许可:除另有说明外,《能量丝理论》(含文本、图表、插图、符号与公式)的著作权由作者(屠广林)享有。
许可方式(CC BY 4.0):在注明作者与来源的前提下,允许复制、转载、节选、改编与再分发。
署名格式(建议):作者:屠广林|作品:《能量丝理论》|来源:energyfilament.org|许可证:CC BY 4.0
验证召集: 作者独立自费、无雇主无资助;下一阶段将优先在最愿意公开讨论、公开复现、公开挑错的环境中推进落地,不限国家。欢迎各国媒体与同行抓住窗口组织验证,并与我们联系。
版本信息: 首次发布:2025-11-11 | 当前版本:v6.0+5.05