目录文档-技术白皮书45-EFT.WP.Data.Pipeline v1.0

第15章 容错性、恢复与灾备


I. 章节目的与范围

固化流水线容错(fault tolerance)、恢复(recovery)与灾备(disaster recovery, DR)的规范:失败语义与补偿、重试与超时、幂等与去重、检查点与快照、备份与回放、RTO/RPO 与演练、跨可用区/跨区域切换与回切、导出物与审计;确保与契约/调度/监控/计量章一致。

II. 术语与依赖


III. 字段与结构(规范性)

# Normative field layout for fault tolerance, recovery, DR, testing and backups.
# Strings of the form "a|b|c" appear to list the allowed alternatives, and
# "<...>" / "s3://.../" are placeholders; the embeddable example in this
# chapter shows concrete values.
# NOTE(review): nesting was reconstructed from the lint-rule JSONPaths
# ($.fault_tolerance.idempotency, $.recovery.checkpoint, $.dr, $.testing.drills,
# $.backups) and from the flow-style example; confirm that `compensation`
# belongs under `fault_tolerance` and `rollbacks`/`fallback` under
# `recovery`/`dr` respectively.
fault_tolerance:
  semantics:
    on_fail: "retry|skip|quarantine|block"
    error_classes: ["retryable", "non_retryable", "escalate"]
  retry:
    policy: {max: 3, backoff: "expo", jitter_ms: 200}
    timeout_s: 1800
  idempotency:
    enabled: true
    dedupe_key: ["<pk>", "<offset|ts>"]
    sink_mode: "idempotent-insert|upsert"
  compensation:
    enabled: true
    handlers:
      - stage: "transform.normalize"
        action: "reverse_op"
        spec: "comp/normalize.reverse.yaml"
      - stage: "feature.map"
        action: "delete_artifact"
        spec: "comp/delete.manifest.yaml"

recovery:
  checkpoint:
    mode: "exactly-once|at-least-once"
    store: "s3://.../chk/<stage>"
    cadence: "PT5M"
    contents: ["offset", "cursor", "watermark", "sink_commit"]
  snapshot:
    enabled: true
    store: "s3://.../snap/<dataset>"
    cadence: "P1D"
    retention: "P30D"
  replay:
    enabled: true
    inputs_lock: "locks/inputs.manifest.json"
    policy: "strict|lenient"
  rollbacks:
    guardrail: {max_depth: 2, require_approval: true}

dr:
  strategy: "active-active|active-passive"
  topology:
    primary: {region: "eu-west-1", azs: ["a", "b"], quorum: 3}
    standby: {region: "eu-central-1", azs: ["a", "b"], quorum: 3}
  rto: "PT30M"
  rpo: "PT5M"
  failover:
    trigger: "manual|auto"
    health_checks: ["latency_ms.p99", "error_rate", "heartbeat"]
    dns_ttl_s: 60
  fallback:
    criteria: ["primary_healthy_24h", "replication_lag<PT1M"]

testing:
  chaos:
    enabled: true
    experiments:
      - {name: "kill-worker", scope: "stage", percent: 10}
      - {name: "net-partition", scope: "cluster", duration_s: 300}
      - {name: "disk-throttle", scope: "node", mbps: 50}
  drills:
    schedule: "quarterly"
    playbooks: ["dr/runbook.md", "rollback/runbook.md"]
    success_criteria: ["rto_met", "rpo_met", "no_data_loss", "alerting_ok"]

backups:
  datasets: ["feat_rows", "train_pkg"]
  cadence: "P1D"
  store: "s3://.../backup"
  encryption: "SSE-KMS"
  integrity: {hash: "sha256", manifest: "backup/manifest.json"}


IV. 失败语义、重试与幂等


V. 补偿、回滚与回放


VI. 检查点、快照与备份


VII. 灾备策略与切换


VIII. 混沌实验、演练与成功准则


IX. 计量与单位(SI)

  1. 性能与目标:RTO、RPO、T_inf(ms)、QPS(1/s)、ρ(—);带宽 net_mbps、体量 size_bytes;
  2. 强制:metrology:{units:"SI", check_dim:true};合成/换算前先做单位归一
  3. 路径量:若容错/恢复流程处理 T_arr,需登记 delta_form、path="gamma(ell)"、measure="d ell",并采用:
    • T_arr = ( 1 / c_ref ) * ( ∫ n_eff d ell ) 或
    • T_arr = ( ∫ ( n_eff / c_ref ) d ell ),通过 check_dim 校核。

X. 机器可读片段(可直接嵌入)

# Embeddable example configuration with concrete values.
# NOTE(review): indentation was reconstructed (the export had flattened it);
# top-level keys follow the JSONPaths used by the lint rules. Confirm the
# placement of `compensation`, `rollbacks` and `fallback` before embedding.
fault_tolerance:
  semantics:
    on_fail: "retry"
    error_classes: ["retryable", "non_retryable", "escalate"]
  retry:
    policy: {max: 3, backoff: "expo", jitter_ms: 200}
    timeout_s: 1800
  idempotency:
    enabled: true
    dedupe_key: ["id", "updated_at"]
    sink_mode: "upsert"
  compensation:
    enabled: true
    handlers:
      - stage: "feature.map"
        action: "delete_artifact"
        spec: "comp/delete.manifest.yaml"

recovery:
  checkpoint:
    mode: "exactly-once"
    store: "s3://meta/chk/feat.map"
    cadence: "PT5M"
    contents: ["offset", "cursor", "watermark", "sink_commit"]
  snapshot:
    enabled: true
    store: "s3://snap/feat_rows"
    cadence: "P1D"
    retention: "P30D"
  replay:
    enabled: true
    inputs_lock: "locks/inputs.manifest.json"
    policy: "strict"
  rollbacks:
    guardrail: {max_depth: 2, require_approval: true}

dr:
  strategy: "active-passive"
  topology:
    primary: {region: "eu-west-1", azs: ["a", "b"], quorum: 3}
    standby: {region: "eu-central-1", azs: ["a", "b"], quorum: 3}
  rto: "PT30M"
  rpo: "PT5M"
  failover:
    trigger: "auto"
    health_checks: ["latency_ms.p99", "error_rate", "heartbeat"]
    dns_ttl_s: 60
  fallback:
    criteria: ["primary_healthy_24h", "replication_lag<PT1M"]

testing:
  chaos:
    enabled: true
    experiments:
      - {name: "kill-worker", scope: "stage", percent: 10}
  drills:
    schedule: "quarterly"
    playbooks: ["dr/runbook.md"]
    success_criteria: ["rto_met", "rpo_met", "no_data_loss"]

backups:
  datasets: ["feat_rows", "train_pkg"]
  cadence: "P1D"
  store: "s3://backup"
  encryption: "SSE-KMS"
  integrity: {hash: "sha256", manifest: "backup/manifest.json"}

metrology: {units: "SI", check_dim: true}


XI. Lint 规则(节选,规范性)

# Lint rules (excerpt). Each rule has an id, a `when` JSONPath selecting the
# node it applies to, an `assert` expression, and a severity `level`.
# Rule text is unchanged; only the sequence/mapping indentation is restored so
# that `when`/`assert`/`level` belong to their `- id:` item.
lint_rules:
  - id: FT.IDEMPOTENCY_REQUIRED
    when: "$.fault_tolerance.idempotency.enabled"
    assert: "value == true"
    level: error

  - id: RC.CHECKPOINT_DEFINED
    when: "$.recovery.checkpoint"
    assert: "has_keys(mode, store, cadence)"
    level: error

  - id: DR.RTO_RPO_DEFINED
    when: "$.dr"
    assert: "has_keys(rto, rpo) and duration_valid(rto) and duration_valid(rpo)"
    level: error

  - id: DR.STRATEGY_ALLOWED
    when: "$.dr.strategy"
    assert: "value in ['active-active','active-passive']"
    level: error

  - id: TEST.DRILLS_SCHEDULED
    when: "$.testing.drills.schedule"
    assert: "matches('^(monthly|quarterly|biannual|annual)$') or duration_valid(value)"
    level: error

  - id: BKP.INTEGRITY_MANIFEST
    when: "$.backups"
    assert: "has_keys(store, cadence, integrity)"
    level: error

  - id: METROLOGY.SI_AND_CHECKDIM
    when: "$.metrology"
    assert: "units == 'SI' and check_dim == true"
    level: error


XII. 导出清单与审计

# Export manifest listing the audit artifacts of this chapter and the
# specifications it references. "..." digests are placeholders left as in the
# original.
# NOTE(review): `references` is nested under `export_manifest` by inference
# (the export had flattened all indentation) — confirm against the consumer.
export_manifest:
  version: "v1.0"
  artifacts:
    - {path: "chk/catalog.json", sha256: "..."}
    - {path: "snap/retention.policy", sha256: "..."}
    - {path: "dr/runbook.md", sha256: "..."}
    - {path: "dr/drill_reports/2025Q3.md", sha256: "..."}
    - {path: "backup/manifest.json", sha256: "..."}
    - {path: "comp/normalize.reverse.yaml", sha256: "..."}
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"
    - "EFT.WP.Data.Pipeline v1.0:Ch.11"
    - "EFT.WP.Data.Pipeline v1.0:Ch.12"


XIII. 本章合规自检


版权与许可(CC BY 4.0)

版权声明:除另有说明外,《能量丝理论》(含文本、图表、插图、符号与公式)的著作权由作者(“屠广林”先生)享有。
许可方式:本作品采用 Creative Commons 署名 4.0 国际许可协议(CC BY 4.0)进行许可;在注明作者与来源的前提下,允许为商业或非商业目的进行复制、转载、节选、改编与再分发。
署名格式(建议):作者:“屠广林”;作品:《能量丝理论》;来源:energyfilament.org;许可证:CC BY 4.0。

首次发布: 2025-11-11|当前版本:v5.1
协议链接:https://creativecommons.org/licenses/by/4.0/