From d80d1cf5539880265ad5eaa8a3fdea0da1337c08 Mon Sep 17 00:00:00 2001
From: yasinshaw <yainshaw@gmail.com>
Date: Sun, 1 Mar 2026 00:06:28 +0800
Subject: [PATCH] feat: add infrastructure interview questions

Add comprehensive interview materials for:
- Service Mesh (Istio, Linkerd)
- RPC Framework (Dubbo, gRPC)
- Container Orchestration (Kubernetes)
- CI/CD (Jenkins, GitLab CI, GitHub Actions)
- Observability (Monitoring, Logging, Tracing)

Each file includes:
- 5-10 core questions
- Detailed standard answers
- Code examples
- Real-world project experience
- Alibaba P7 bonus points

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 questions/ci-cd.md                   | 1338 ++++++++++++++++++++++++++
 questions/container-orchestration.md | 1020 ++++++++++++++++++++
 questions/observability.md           | 1179 +++++++++++++++++++++++
 questions/rpc-framework.md           |  745 ++++++++++++++
 questions/service-mesh.md            |  605 ++++++++++++
 5 files changed, 4887 insertions(+)
 create mode 100644 questions/ci-cd.md
 create mode 100644 questions/container-orchestration.md
 create mode 100644 questions/observability.md
 create mode 100644 questions/rpc-framework.md
 create mode 100644 questions/service-mesh.md

diff --git a/questions/ci-cd.md b/questions/ci-cd.md
new file mode 100644
index 0000000..2b83706
--- /dev/null
+++ b/questions/ci-cd.md
@@ -0,0 +1,1338 @@
+# CI/CD (持续集成/持续部署)
+
+## 问题
+
+**背景**：CI/CD 是现代软件开发的核心实践，通过自动化构建、测试和部署，提高软件交付速度和质量。
+
+**问题**：
+1. 什么是 CI/CD？它解决了哪些问题？
+2. Jenkins Pipeline 的核心概念是什么？
+3. 请描述一个完整的 CI/CD 流程
+4. GitLab CI 和 GitHub Actions 的区别是什么？
+5. 如何实现蓝绿部署和金丝雀发布？
+6. CI/CD 中的环境变量和密钥如何管理？
+7. 如何实现基础设施即代码（IaC）？
+8. CI/CD 流水线如何集成测试（单元测试、集成测试、E2E 测试）？
+9. 如何回滚失败的部署？
+10. 在实际项目中如何设计 CI/CD 流水线？
+
+---
+
+## 标准答案
+
+### 1. CI/CD 概述
+
+#### **定义**：
+```
+CI (Continuous Integration，持续集成)：
+- 开发人员频繁提交代码到共享仓库
+- 每次提交都自动触发构建和测试
+- 及早发现集成错误
+
+CD (Continuous Delivery，持续交付)：
+- 代码通过测试后自动部署到 staging 环境
+- 随时可以部署到生产环境
+- 需要手动触发生产部署
+
+CD (Continuous Deployment，持续部署)：
+- 代码通过测试后自动部署到生产环境
+- 无需人工干预
+- 完全自动化
+```
+
+#### **解决的问题**：
+```
+传统开发痛点：
+├─ 集成困难（大量代码合并冲突）
+├─ 测试反馈慢（手动测试耗时长）
+├─ 部署风险高（手动部署容易出错）
+├─ 交付周期长（从开发到上线耗时数周）
+└─ 回滚困难（出问题难以快速恢复）
+
+CI/CD 解决方案：
+├─ 自动化构建和测试（每次提交自动运行）
+├─ 快速反馈（几分钟内知道测试结果）
+├─ 自动化部署（一键部署到任何环境）
+├─ 快速交付（每天多次部署）
+└─ 易于回滚（保留历史版本，快速回滚）
+```
+
+---
+
+### 2. Jenkins Pipeline 核心概念
+
+#### **Pipeline 类型**：
+```groovy
+// 1. Declarative Pipeline（声明式，推荐）
+pipeline {
+    agent any
+
+    stages {
+        stage('Build') {
+            steps {
+                sh 'mvn clean package'
+            }
+        }
+        stage('Test') {
+            steps {
+                sh 'mvn test'
+            }
+        }
+        stage('Deploy') {
+            steps {
+                sh 'kubectl apply -f k8s/'
+            }
+        }
+    }
+
+    post {
+        success {
+            echo 'Pipeline succeeded!'
+        }
+        failure {
+            echo 'Pipeline failed!'
+        }
+    }
+}
+
+// 2. Scripted Pipeline（脚本式，灵活）
+node {
+    stage('Build') {
+        sh 'mvn clean package'
+    }
+    stage('Test') {
+        sh 'mvn test'
+    }
+    stage('Deploy') {
+        sh 'kubectl apply -f k8s/'
+    }
+}
+```
+
+#### **核心概念**：
+
+**1. Agent（代理）**
+```groovy
+// 任意可用 agent
+agent any
+
+// 指定标签
+agent { label 'linux' }
+
+// Docker agent
+agent {
+    docker {
+        image 'maven:3.8.1-openjdk-11'
+        args '-v $HOME/.m2:/root/.m2'
+    }
+}
+
+// Kubernetes agent（Pod Template）
+agent {
+    kubernetes {
+        yaml '''
+        spec:
+          containers:
+          - name: maven
+            image: maven:3.8.1-openjdk-11
+            command: ["cat"]
+            tty: true
+        '''
+    }
+}
+```
+
+**2. Stages（阶段）**
+```groovy
+stages {
+    stage('Build') {
+        when {
+            branch 'main'  // 只在 main 分支执行
+        }
+        steps {
+            sh 'mvn clean package'
+        }
+    }
+
+    stage('Deploy to Staging') {
+        when {
+            branch 'develop'
+        }
+        steps {
+            sh 'kubectl apply -f k8s/staging/'
+        }
+    }
+
+    stage('Deploy to Production') {
+        when {
+            tag pattern: "v\\d+\\.\\d+\\.\\d+", comparator: "REGEXP"
+        }
+        steps {
+            sh 'kubectl apply -f k8s/production/'
+        }
+    }
+}
+```
+
+**3. Post（后置操作）**
+```groovy
+post {
+    always {
+        junit 'target/surefire-reports/*.xml'  // 发布测试报告
+    }
+    success {
+        sh 'notify-success.sh'  // 发送成功通知
+    }
+    failure {
+        sh 'notify-failure.sh'  // 发送失败通知
+    }
+    unstable {
+        echo 'This pipeline is unstable!'
+    }
+    changed {
+        echo 'Pipeline status changed!'
+    }
+}
+```
+
+**4. Environment（环境变量）**
+```groovy
+environment {
+    MAVEN_HOME = '/opt/maven'
+    DATABASE_URL = credentials('database-url')  // 从 Jenkins 凭证获取
+    DEPLOY_ENV = 'staging'
+}
+
+pipeline {
+    agent any
+    environment {
+        APP_VERSION = sh(script: 'git describe --tags --always', returnStdout: true).trim()
+    }
+    stages {
+        stage('Build') {
+            steps {
+                sh "mvn clean package -Dapp.version=${APP_VERSION}"
+            }
+        }
+    }
+}
+```
+
+**5. Parameters（参数化构建）**
+```groovy
+pipeline {
+    agent any
+    parameters {
+        string(name: 'DEPLOY_ENV', defaultValue: 'staging', description: 'Deploy environment')
+        booleanParam(name: 'RUN_TESTS', defaultValue: true, description: 'Run tests')
+        choice(name: 'TIER', choices: ['dev', 'staging', 'production'], description: 'Environment tier')
+    }
+    stages {
+        stage('Deploy') {
+            when {
+                expression { params.DEPLOY_ENV == 'production' }
+            }
+            steps {
+                sh "kubectl apply -f k8s/${params.DEPLOY_ENV}/"
+            }
+        }
+    }
+}
+```
+
+---
+
+### 3. 完整 CI/CD 流程
+
+#### **流程图**：
+```
+开发者提交代码
+    ↓
+触发 CI/CD 流水线
+    ↓
+┌─────────────────────────────────────┐
+│ 1. 代码检查                          │
+│    - 代码风格检查 (ESLint, Checkstyle)│
+│    - 静态分析 (SonarQube)             │
+│    - 安全扫描 (Snyk, OWASP)           │
+└─────────────────────────────────────┘
+    ↓
+┌─────────────────────────────────────┐
+│ 2. 构建                              │
+│    - 编译 (Maven/Gradle/npm)          │
+│    - 打包 (JAR/WAR/Docker镜像)        │
+│    - 版本打标 (git tag)               │
+└─────────────────────────────────────┘
+    ↓
+┌─────────────────────────────────────┐
+│ 3. 测试                              │
+│    - 单元测试 (JUnit, pytest)         │
+│    - 集成测试 (TestContainers)        │
+│    - 代码覆盖率 (JaCoCo)              │
+└─────────────────────────────────────┘
+    ↓
+┌─────────────────────────────────────┐
+│ 4. 构建镜像                           │
+│    - Docker Build                    │
+│    - 推送到镜像仓库 (Docker Hub/ECR)  │
+└─────────────────────────────────────┘
+    ↓
+┌─────────────────────────────────────┐
+│ 5. 部署到 Staging 环境                │
+│    - Kubernetes Deploy               │
+│    - 数据库迁移 (Flyway/Liquibase)    │
+└─────────────────────────────────────┘
+    ↓
+┌─────────────────────────────────────┐
+│ 6. 自动化测试                         │
+│    - E2E 测试 (Selenium/Cypress)     │
+│    - 性能测试 (JMeter/Gatling)        │
+│    - 安全测试 (OWASP ZAP)             │
+└─────────────────────────────────────┘
+    ↓
+┌─────────────────────────────────────┐
+│ 7. 人工审核 (可选)                    │
+│    - 查看测试报告                     │
+│    - 审核变更内容                     │
+└─────────────────────────────────────┘
+    ↓
+┌─────────────────────────────────────┐
+│ 8. 部署到 Production 环境             │
+│    - 蓝绿部署/金丝雀发布               │
+│    - 监控验证                         │
+└─────────────────────────────────────┘
+    ↓
+┌─────────────────────────────────────┐
+│ 9. 发布通知                           │
+│    - 钉钉/企业微信/Slack               │
+│    - 发布日志                         │
+└─────────────────────────────────────┘
+```
+
+#### **Jenkinsfile 示例**：
+```groovy
+pipeline {
+    agent any
+
+    tools {
+        maven 'Maven 3.8.1'
+        jdk 'JDK 11'
+    }
+
+    environment {
+        IMAGE_NAME = "my-app"
+        IMAGE_TAG = "${env.BUILD_NUMBER}"
+        REGISTRY = "registry.example.com"
+        KUBECONFIG = credentials('kubeconfig')
+    }
+
+    stages {
+        stage('Checkout') {
+            steps {
+                git branch: 'main', url: 'https://github.com/myorg/myapp.git'
+            }
+        }
+
+        stage('Code Quality') {
+            steps {
+                sh 'mvn checkstyle:check'
+                sh 'mvn spotbugs:check'
+                script {
+                    def scannerHome = tool 'SonarQube Scanner';
+                    withSonarQubeEnv('MySonarQube') {
+                        sh "${scannerHome}/bin/sonar-scanner"
+                    }
+                }
+            }
+        }
+
+        stage('Build') {
+            steps {
+                sh 'mvn clean package -DskipTests'
+                archiveArtifacts artifacts: 'target/*.jar', fingerprint: true
+            }
+        }
+
+        stage('Unit Test') {
+            steps {
+                sh 'mvn test'
+                junit 'target/surefire-reports/*.xml'
+            }
+        }
+
+        stage('Integration Test') {
+            steps {
+                sh 'mvn verify -Pintegration-test'
+                junit 'target/failsafe-reports/*.xml'
+            }
+        }
+
+        stage('Build & Push Docker Image') {
+            steps {
+                script {
+                    docker.withRegistry("https://${REGISTRY}", 'docker-registry-credentials') {
+                        def image = docker.build("${IMAGE_NAME}:${IMAGE_TAG}")
+                        image.push()
+                        image.push('latest')
+                    }
+                }
+            }
+        }
+
+        stage('Deploy to Staging') {
+            steps {
+                sh """
+                    kubectl set image deployment/my-app \
+                        my-app=${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} \
+                        --namespace=staging
+                """
+                sh 'kubectl rollout status deployment/my-app --namespace=staging'
+            }
+        }
+
+        stage('E2E Test') {
+            steps {
+                sh 'mvn verify -Pe2e-test'
+            }
+        }
+
+        stage('Deploy to Production') {
+            when {
+                branch 'main'
+            }
+            steps {
+                input message: 'Deploy to Production?', ok: 'Deploy'
+                sh """
+                    kubectl set image deployment/my-app \
+                        my-app=${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} \
+                        --namespace=production
+                """
+                sh 'kubectl rollout status deployment/my-app --namespace=production'
+            }
+        }
+    }
+
+    post {
+        always {
+            cleanWs()
+        }
+        success {
+            sh 'notify-success.sh'
+        }
+        failure {
+            sh 'notify-failure.sh'
+        }
+    }
+}
+```
+
+---
+
+### 4. GitLab CI vs GitHub Actions
+
+#### **对比表**：
+
+| 特性 | GitLab CI | GitHub Actions |
+|------|-----------|----------------|
+| 集成度 | GitLab 内置 | GitHub 内置 |
+| 配置文件 | .gitlab-ci.yml | .github/workflows/*.yml |
+| Runner 类型 | Shared/Specific/Group | Hosted/Self-hosted |
+| 缓存 | artifacts/cache | actions/cache |
+| 密钥管理 | Variables/Secrets | Secrets/Environments |
+| 矩阵构建 | 支持 | 支持 |
+| 复用性 | Include/Template | Reusable Workflows |
+| 社区生态 | 丰富 | 快速增长 |
+
+#### **GitLab CI 示例**：
+```yaml
+# .gitlab-ci.yml
+stages:
+  - build
+  - test
+  - deploy
+
+variables:
+  MAVEN_OPTS: "-Dmaven.repo.local=$CI_PROJECT_DIR/.m2/repository"
+  IMAGE_NAME: registry.example.com/my-app
+  IMAGE_TAG: $CI_PIPELINE_ID
+
+cache:
+  paths:
+    - .m2/repository/
+
+# 构建
+build:
+  stage: build
+  image: maven:3.8.1-openjdk-11
+  script:
+    - mvn clean package -DskipTests
+  artifacts:
+    paths:
+      - target/*.jar
+    expire_in: 1 week
+
+# 单元测试
+unit-test:
+  stage: test
+  image: maven:3.8.1-openjdk-11
+  script:
+    - mvn test
+  artifacts:
+    reports:
+      junit: target/surefire-reports/*.xml
+
+# 代码质量
+code-quality:
+  stage: test
+  image: sonarsource/sonar-scanner-cli
+  script:
+    - sonar-scanner
+  allow_failure: true
+
+# 构建镜像
+build-image:
+  stage: build
+  image: docker:20.10.7
+  services:
+    - docker:20.10.7-dind
+  before_script:
+    - echo "$REGISTRY_PASSWORD" | docker login -u "$REGISTRY_USER" --password-stdin registry.example.com
+  script:
+    - docker build -t $IMAGE_NAME:$IMAGE_TAG .
+    - docker push $IMAGE_NAME:$IMAGE_TAG
+    - docker tag $IMAGE_NAME:$IMAGE_TAG $IMAGE_NAME:latest
+    - docker push $IMAGE_NAME:latest
+
+# 部署到 Staging
+deploy-staging:
+  stage: deploy
+  image: bitnami/kubectl:latest
+  script:
+    - kubectl set image deployment/my-app my-app=$IMAGE_NAME:$IMAGE_TAG --namespace=staging
+    - kubectl rollout status deployment/my-app --namespace=staging
+  environment:
+    name: staging
+    url: https://staging.example.com
+  only:
+    - develop
+
+# 部署到 Production
+deploy-production:
+  stage: deploy
+  image: bitnami/kubectl:latest
+  script:
+    - kubectl set image deployment/my-app my-app=$IMAGE_NAME:$IMAGE_TAG --namespace=production
+    - kubectl rollout status deployment/my-app --namespace=production
+  environment:
+    name: production
+    url: https://example.com
+  when: manual  # 手动触发
+  only:
+    - main
+```
+
+#### **GitHub Actions 示例**：
+```yaml
+# .github/workflows/ci-cd.yml
+name: CI/CD Pipeline
+
+on:
+  push:
+    branches: [ main, develop ]
+  pull_request:
+    branches: [ main ]
+  release:
+    types: [ created ]
+
+env:
+  REGISTRY: registry.example.com
+  IMAGE_NAME: my-app
+
+jobs:
+  # 并行运行
+  build-and-test:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Set up JDK 11
+      uses: actions/setup-java@v3
+      with:
+        java-version: '11'
+        distribution: 'temurin'
+        cache: maven
+
+    - name: Build with Maven
+      run: mvn clean package -DskipTests
+
+    - name: Run Unit Tests
+      run: mvn test
+
+    - name: Upload Test Results
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: test-results
+        path: target/surefire-reports/*.xml
+
+    - name: Build Docker Image
+      run: |
+        docker build -t ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.run_number }} .
+        docker tag ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.run_number }} ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest
+
+    - name: Login to Registry
+      uses: docker/login-action@v2
+      with:
+        registry: ${{ env.REGISTRY }}
+        username: ${{ secrets.REGISTRY_USERNAME }}
+        password: ${{ secrets.REGISTRY_PASSWORD }}
+
+    - name: Push Docker Image
+      run: |
+        docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.run_number }}
+        docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest
+
+  # 依赖 build-and-test
+  deploy-staging:
+    needs: build-and-test
+    runs-on: ubuntu-latest
+    if: github.ref == 'refs/heads/develop'
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Set up kubectl
+      uses: azure/setup-kubectl@v3
+
+    - name: Deploy to Staging
+      run: |
+        kubectl set image deployment/my-app my-app=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.run_number }} --namespace=staging
+        kubectl rollout status deployment/my-app --namespace=staging
+
+  deploy-production:
+    needs: build-and-test
+    runs-on: ubuntu-latest
+    if: github.event_name == 'release'
+    environment:
+      name: production
+      url: https://example.com
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Set up kubectl
+      uses: azure/setup-kubectl@v3
+
+    - name: Deploy to Production
+      run: |
+        kubectl set image deployment/my-app my-app=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.run_number }} --namespace=production
+        kubectl rollout status deployment/my-app --namespace=production
+
+    - name: Notify
+      run: |
+        curl -X POST ${{ secrets.SLACK_WEBHOOK }} \
+          -H 'Content-Type: application/json' \
+          -d '{"text":"Deployment to Production successful!"}'
+```
+
+---
+
+### 5. 蓝绿部署和金丝雀发布
+
+#### **蓝绿部署（Blue-Green Deployment）**：
+```
+步骤：
+1. 部署新版本到 Green 环境
+2. 验证 Green 环境（健康检查、E2E 测试）
+3. 切换流量到 Green 环境
+4. 保留 Blue 环境，以便快速回滚
+
+优势：
+- 零停机部署
+- 快速回滚（切换回 Blue）
+- 风险低
+
+劣势：
+- 需要双倍资源
+- 数据库迁移需要特殊处理
+```
+
+**Kubernetes 实现**：
+```yaml
+# 1. 部署 Blue（当前版本）
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: my-app-blue
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: my-app
+      version: blue
+  template:
+    metadata:
+      labels:
+        app: my-app
+        version: blue
+    spec:
+      containers:
+      - name: my-app
+        image: my-app:1.0.0
+
+---
+# 2. 部署 Green（新版本）
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: my-app-green
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: my-app
+      version: green
+  template:
+    metadata:
+      labels:
+        app: my-app
+        version: green
+    spec:
+      containers:
+      - name: my-app
+        image: my-app:2.0.0
+
+---
+# 3. Service 指向 Blue
+apiVersion: v1
+kind: Service
+metadata:
+  name: my-app
+spec:
+  selector:
+    app: my-app
+    version: blue  # 当前指向 Blue
+  ports:
+  - port: 80
+    targetPort: 8080
+
+# 切换到 Green：修改 selector.version = green
+```
+
+**Jenkins Pipeline**：
+```groovy
+stage('Blue-Green Deployment') {
+    steps {
+        script {
+            // 1. 部署 Green
+            sh "kubectl apply -f k8s/green-deployment.yaml"
+
+            // 2. 等待 Green Ready
+            sh 'kubectl rollout status deployment/my-app-green'
+
+            // 3. 健康检查
+            sh '''
+                for i in $(seq 1 30); do
+                    curl -f http://my-app-green.default.svc.cluster.local/health && exit 0 || sleep 5
+                done; exit 1
+            '''
+
+            // 4. 切换流量到 Green
+            sh "kubectl patch service my-app -p '{\"spec\":{\"selector\":{\"version\":\"green\"}}}'"
+
+            // 5. 监控 Green（如果失败，切换回 Blue）
+            timeout(time: 5, unit: 'MINUTES') {
+                input message: 'Verify Green environment. OK to proceed?', ok: 'Keep Green'
+            }
+        }
+    }
+    post {
+        unsuccessful {
+            // 回滚到 Blue（失败、超时或人工中止都会触发；failure 不覆盖 ABORTED 状态）
+            sh "kubectl patch service my-app -p '{\"spec\":{\"selector\":{\"version\":\"blue\"}}}'"
+        }
+    }
+}
+```
+
+#### **金丝雀发布（Canary Deployment）**：
+```
+步骤：
+1. 部署新版本到小部分实例（如 10%）
+2. 观察错误率、延迟等指标
+3. 逐步增加流量（10% → 50% → 100%）
+4. 如果出现问题，立即回滚
+
+优势：
+- 风险可控
+- 渐进式发布
+- 可以快速发现问题
+
+劣势：
+- 需要流量管理（如 Istio）
+- 监控要求高
+```
+
+**Istio 实现**：
+```yaml
+# 1. 部署 v1 和 v2
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: my-app-v1
+spec:
+  replicas: 9
+  selector:
+    matchLabels: {app: my-app, version: v1}
+  template:
+    metadata:
+      labels: {app: my-app, version: v1}
+    spec:
+      containers:
+      - name: my-app
+        image: my-app:1.0.0
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: my-app-v2
+spec:
+  replicas: 1
+  selector:
+    matchLabels: {app: my-app, version: v2}
+  template:
+    metadata:
+      labels: {app: my-app, version: v2}
+    spec:
+      containers:
+      - name: my-app
+        image: my-app:2.0.0
+
+---
+# 2. VirtualService 配置金丝雀
+apiVersion: networking.istio.io/v1beta1
+kind: VirtualService
+metadata:
+  name: my-app
+spec:
+  hosts:
+  - my-app
+  http:
+  - route:
+    - destination:
+        host: my-app
+        subset: v1
+      weight: 90  # 90% 流量到 v1
+    - destination:
+        host: my-app
+        subset: v2
+      weight: 10  # 10% 流量到 v2
+
+---
+# 3. DestinationRule 定义 subset
+apiVersion: networking.istio.io/v1beta1
+kind: DestinationRule
+metadata:
+  name: my-app
+spec:
+  host: my-app
+  subsets:
+  - name: v1
+    labels:
+      version: v1
+  - name: v2
+    labels:
+      version: v2
+```
+
+**Jenkins Pipeline**：
+```groovy
+stage('Canary Deployment') {
+    steps {
+        script {
+            // 1. 部署 v2（1 个副本）
+            sh "kubectl apply -f k8s/v2-deployment.yaml"
+
+            // 2. 配置 10% 流量到 v2
+            sh "kubectl apply -f istio/10-percent-canary.yaml"
+
+            // 3. 监控 5 分钟
+            sleep(time: 5, unit: 'MINUTES')
+
+            // 4. 检查错误率
+            def errorRate = sh(
+                script: '''curl -sG http://prometheus/api/v1/query --data-urlencode 'query=sum(rate(requests_total{status=~"5.."}[5m])) / sum(rate(requests_total[5m]))' | jq -r '.data.result[0].value[1] // "0"' ''',
+                returnStdout: true
+            ).trim()
+
+            if (errorRate.toDouble() > 0.01) {
+                error "Error rate too high: ${errorRate}"
+            }
+
+            // 5. 逐步增加流量
+            sh "kubectl apply -f istio/50-percent-canary.yaml"
+            sleep(time: 5, unit: 'MINUTES')
+
+            // 6. 100% 流量到 v2
+            sh "kubectl apply -f istio/100-percent-canary.yaml"
+        }
+    }
+    post {
+        failure {
+            // 回滚到 v1
+            sh "kubectl apply -f istio/rollback-to-v1.yaml"
+        }
+    }
+}
+```
+
+---
+
+### 6. 环境变量和密钥管理
+
+#### **Jenkins 凭证管理**：
+```groovy
+// 1. 使用 Jenkins 凭证
+withCredentials([
+    string(credentialsId: 'database-url', variable: 'DATABASE_URL'),
+    usernamePassword(credentialsId: 'docker-registry', usernameVariable: 'REGISTRY_USER', passwordVariable: 'REGISTRY_PASSWORD')
+]) {
+    sh '''
+        echo "$REGISTRY_PASSWORD" | docker login -u "$REGISTRY_USER" --password-stdin registry.example.com
+        docker build -t myapp:${BUILD_NUMBER} --build-arg DATABASE_URL="$DATABASE_URL" .
+    '''
+}
+
+// 2. 使用 Secret File
+withCredentials([file(credentialsId: 'kubeconfig', variable: 'KUBECONFIG')]) {
+    sh 'kubectl --kubeconfig=$KUBECONFIG get pods'
+}
+
+// 3. 使用 Secret Text
+withCredentials([string(credentialsId: 'slack-webhook', variable: 'SLACK_WEBHOOK')]) {
+    sh '''
+        curl -X POST "$SLACK_WEBHOOK" \
+          -H 'Content-Type: application/json' \
+          -d '{"text":"Build successful!"}'
+    '''
+}
+```
+
+#### **GitLab CI 密钥管理**：
+```yaml
+# 在 GitLab UI 中设置 CI/CD Variables
+variables:
+  # 普通变量
+  DEPLOY_ENV: production
+
+# 使用 Masked Variables（隐藏变量）
+build:
+  script:
+    - echo "$REGISTRY_PASSWORD" | docker login -u "$REGISTRY_USER" --password-stdin
+    - echo $DATABASE_URL  # 在日志中会被替换为 [MASKED]
+
+# 使用 File 类型变量（自动保存为文件）
+deploy:
+  script:
+    - kubectl --kubeconfig=$KUBECONFIG get pods
+```
+
+#### **GitHub Actions 密钥管理**：
+```yaml
+# 在 GitHub UI 中设置 Secrets
+env:
+  DATABASE_URL: ${{ secrets.DATABASE_URL }}
+
+steps:
+- name: Login to Registry
+  uses: docker/login-action@v2
+  with:
+    registry: registry.example.com
+    username: ${{ secrets.REGISTRY_USERNAME }}
+    password: ${{ secrets.REGISTRY_PASSWORD }}
+
+- name: Deploy
+  run: |
+    kubectl --kubeconfig <(echo "${{ secrets.KUBECONFIG }}") get pods
+```
+
+#### **Kubernetes Secrets**：
+```yaml
+# 1. 创建 Secret
+apiVersion: v1
+kind: Secret
+metadata:
+  name: app-secrets
+type: Opaque
+data:
+  database-url: BASE64_ENCODED_URL
+  api-key: BASE64_ENCODED_KEY
+---
+# 2. 在 Pod 中使用
+apiVersion: v1
+kind: Pod
+spec:
+  containers:
+  - name: app
+    image: my-app
+    env:
+    - name: DATABASE_URL
+      valueFrom:
+        secretKeyRef:
+          name: app-secrets
+          key: database-url
+    - name: API_KEY
+      valueFrom:
+        secretKeyRef:
+          name: app-secrets
+          key: api-key
+```
+
+---
+
+### 7. 基础设施即代码（IaC）
+
+#### **Terraform 示例**：
+```hcl
+# main.tf
+
+provider "aws" {
+  region = "us-west-2"
+}
+
+# VPC
+resource "aws_vpc" "main" {
+  cidr_block           = "10.0.0.0/16"
+  enable_dns_hostnames = true
+  enable_dns_support   = true
+
+  tags = {
+    Name = "main-vpc"
+  }
+}
+
+# EKS Cluster
+resource "aws_eks_cluster" "main" {
+  name     = "main-cluster"
+  role_arn = aws_iam_role.eks_cluster.arn
+
+  vpc_config {
+    subnet_ids = aws_subnet.private[*].id
+  }
+
+  depends_on = [aws_iam_role_policy_attachment.eks_cluster_policy]
+}
+
+# Node Group
+resource "aws_eks_node_group" "main" {
+  cluster_name    = aws_eks_cluster.main.name
+  node_group_name = "main-node-group"
+  node_role_arn   = aws_iam_role.eks_nodes.arn
+  subnet_ids      = aws_subnet.private[*].id
+
+  scaling_config {
+    desired_size = 3
+    max_size     = 5
+    min_size     = 1
+  }
+
+  instance_types = ["t3.medium"]
+
+  depends_on = [aws_iam_role_policy_attachment.eks_nodes_policy]
+}
+
+# RDS Database
+resource "aws_db_instance" "main" {
+  identifier           = "main-db"
+  engine              = "mysql"
+  engine_version      = "8.0"
+  instance_class      = "db.t3.micro"
+  allocated_storage   = 20
+  storage_encrypted   = true
+
+  db_name  = "mydb"
+  username = var.db_username
+  password = var.db_password
+
+  vpc_security_group_ids = [aws_security_group.db.id]
+  db_subnet_group_name   = aws_db_subnet_group.main.name
+
+  backup_retention_period = 7
+  skip_final_snapshot    = false
+  final_snapshot_identifier = "main-db-final-snapshot"
+}
+
+# Output
+output "cluster_endpoint" {
+  value = aws_eks_cluster.main.endpoint
+}
+
+output "db_endpoint" {
+  value     = aws_db_instance.main.endpoint
+  sensitive = true
+}
+```
+
+**在 CI/CD 中使用 Terraform**：
+```groovy
+stage('Terraform Apply') {
+    steps {
+        withCredentials([
+            string(credentialsId: 'aws-access-key-id', variable: 'AWS_ACCESS_KEY_ID'),
+            string(credentialsId: 'aws-secret-access-key', variable: 'AWS_SECRET_ACCESS_KEY')
+        ]) {
+            script {
+                dir('terraform') {
+                    // 1. 初始化
+                    sh 'terraform init'
+
+                    // 2. 格式检查
+                    sh 'terraform fmt -check'
+
+                    // 3. 验证
+                    sh 'terraform validate'
+
+                    // 4. Plan
+                    def plan = sh(
+                        script: 'terraform plan -out=tfplan',
+                        returnStdout: true
+                    )
+
+                    // 5. 人工审核
+                    input message: "Review Terraform Plan:\n${plan}", ok: 'Apply'
+
+                    // 6. Apply
+                    sh 'terraform apply tfplan'
+                }
+            }
+        }
+    }
+}
+```
+
+---
+
+### 8. 集成测试
+
+#### **Pipeline 测试阶段**：
+```groovy
+stage('Test') {
+    parallel {
+        stage('Unit Test') {
+            steps {
+                sh 'mvn test'
+                junit 'target/surefire-reports/*.xml'
+            }
+        }
+        stage('Integration Test') {
+            steps {
+                sh 'mvn verify -Pintegration-test'
+                junit 'target/failsafe-reports/*.xml'
+            }
+        }
+        stage('Code Coverage') {
+            steps {
+                sh 'mvn jacoco:report'
+                jacoco execPattern: 'target/jacoco.exec', classPattern: 'target/classes', sourcePattern: 'src/main/java'
+            }
+        }
+    }
+}
+
+stage('E2E Test') {
+    steps {
+        sh 'mvn verify -Pe2e-test'
+        publishHTML([
+            reportDir: 'target/cypress-report',
+            reportFiles: 'index.html',
+            reportName: 'E2E Test Report'
+        ])
+    }
+}
+```
+
+#### **TestContainers 示例**：
+```java
+@SpringBootTest
+@Testcontainers
+public class UserServiceIntegrationTest {
+    @Autowired private UserService userService;
+    @Container
+    private static final PostgreSQLContainer<?> postgres = new PostgreSQLContainer<>("postgres:13");
+
+    @Container
+    private static final GenericContainer<?> redis = new GenericContainer<>("redis:6")
+        .withExposedPorts(6379);
+
+    @DynamicPropertySource
+    static void configureProperties(DynamicPropertyRegistry registry) {
+        registry.add("spring.datasource.url", postgres::getJdbcUrl);
+        registry.add("spring.datasource.username", postgres::getUsername);
+        registry.add("spring.datasource.password", postgres::getPassword);
+        registry.add("spring.redis.host", redis::getHost);
+        registry.add("spring.redis.port", () -> redis.getFirstMappedPort());
+    }
+
+    @Test
+    void shouldCreateUser() {
+        User user = new User("Alice", "alice@example.com");
+        User saved = userService.save(user);
+        assertNotNull(saved.getId());
+    }
+}
+```
+
+---
+
+### 9. 回滚失败的部署
+
+#### **Kubernetes 回滚**：
+```bash
+# 1. 查看历史版本
+kubectl rollout history deployment/my-app
+
+# 2. 回滚到上一版本
+kubectl rollout undo deployment/my-app
+
+# 3. 回滚到指定版本
+kubectl rollout undo deployment/my-app --to-revision=3
+
+# 4. 暂停部署（出现问题）
+kubectl rollout pause deployment/my-app
+
+# 5. 恢复部署
+kubectl rollout resume deployment/my-app
+```
+
+#### **Jenkins Pipeline 回滚**：
+```groovy
+stage('Deploy') {
+    steps {
+        script {
+            // 1. 记录当前版本
+            def currentVersion = sh(
+                script: 'kubectl get deployment my-app -o jsonpath="{.spec.template.spec.containers[0].image}"',
+                returnStdout: true
+            ).trim()
+
+            // 2. 部署新版本
+            sh "kubectl set image deployment/my-app my-app=${IMAGE}:${TAG}"
+
+            // 3. 等待部署完成
+            timeout(time: 5, unit: 'MINUTES') {
+                sh 'kubectl rollout status deployment/my-app'
+            }
+
+            // 4. 健康检查
+            sh '''
+                for i in $(seq 1 30); do
+                    curl -f http://my-app.default.svc.cluster.local/health && exit 0 || sleep 5
+                done; exit 1
+            '''
+
+            // 5. 监控
+            sleep(time: 2, unit: 'MINUTES')
+
+            // 6. 检查错误率
+            def errorRate = sh(
+                script: '''curl -sG http://prometheus/api/v1/query --data-urlencode 'query=sum(rate(requests_total{status=~"5.."}[5m])) / sum(rate(requests_total[5m]))' | jq -r '.data.result[0].value[1] // "0"' ''',
+                returnStdout: true
+            ).trim()
+
+            if (errorRate.toDouble() > 0.05) {
+                error "Error rate too high: ${errorRate}, rolling back..."
+            }
+        }
+    }
+    post {
+        failure {
+            script {
+                // 回滚到上一版本
+                sh 'kubectl rollout undo deployment/my-app'
+                echo 'Rolled back to previous version'
+            }
+        }
+    }
+}
+```
+
+---
+
+### 10. 实际项目经验
+
+#### **场景 1：电商系统 CI/CD 流水线**
+```
+需求：
+- 代码提交后自动构建、测试
+- 通过测试后自动部署到 Staging
+- Staging 通过 E2E 测试后，手动部署到 Production
+- 生产环境支持蓝绿部署
+
+方案：
+1. GitHub Actions 构建 Docker 镜像
+2. 推送到私有镜像仓库
+3. 部署到 Kubernetes Staging 环境
+4. 运行 Cypress E2E 测试
+5. 人工审核后部署到 Production
+6. 使用 Istio 实现蓝绿切换
+```
+
+#### **场景 2：数据库迁移自动化**
+```yaml
+# 使用 Flyway 自动迁移数据库
+migrate:
+  stage: migrate
+  image: flyway/flyway:7
+  script:
+    - flyway migrate -url=$DATABASE_URL -user=$DATABASE_USER -password=$DATABASE_PASSWORD
+  only:
+    - main
+```
+
+#### **场景 3：多环境配置管理**
+```yaml
+# 使用 Helm Charts 实现多环境部署
+deploy-staging:
+  script:
+    - helm upgrade --install my-app ./helm-chart --namespace staging --values helm-chart/values-staging.yaml
+
+deploy-production:
+  script:
+    - helm upgrade --install my-app ./helm-chart --namespace production --values helm-chart/values-production.yaml
+```
+
+---
+
+### 11. 阿里 P7 加分项
+
+**架构设计能力**：
+- 设计过企业级 CI/CD 平台（支持多语言、多环境）
+- 实现过自建 Runner 集群（Kubernetes Executors）
+- 有多租户、多团队的 CI/CD 隔离经验
+
+**深度理解**：
+- 熟悉 Jenkins/GitLab CI 源码和插件开发
+- 理解分布式缓存和构建加速（Build Cache、Docker Layer Cache）
+- 有 GitOps 实践经验（ArgoCD、Flux）
+
+**性能优化**：
+- 优化过构建时间（并行构建、增量构建、缓存策略）
+- 实现过分布式构建（Build Farm）
+- 优化过 Runner 资源利用率（动态扩缩容）
+
+**安全实践**：
+- 实现过 CI/CD 安全扫描（SAST、DAST、依赖扫描）
+- 有签名和验证经验（容器镜像签名、Commit Signing）
+- 实现过密钥轮换和凭证管理
+
+**监控和可观测性**：
+- 集成过 CI/CD 监控（构建成功率、构建时间、部署频率）
+- 实现过部署追踪（Deployment Tracking、Change Log）
+- 设计过性能测试自动化（K6、JMeter 集成）
+
+**DevSecOps**：
+- 实现过安全左移（Pre-commit Hooks、PR Check）
+- 集成过合规性检查（PCI-DSS、GDPR）
+- 实现过供应链安全（SBOM、漏洞扫描）
diff --git a/questions/container-orchestration.md b/questions/container-orchestration.md
new file mode 100644
index 0000000..d78d3ff
--- /dev/null
+++ b/questions/container-orchestration.md
@@ -0,0 +1,1020 @@
+# 容器编排 (Kubernetes)
+
+## 问题
+
+**背景**：随着容器化技术的普及，容器编排成为管理大规模容器集群的关键。Kubernetes 作为事实上的标准，提供了自动化部署、扩展和管理容器化应用的能力。
+
+**问题**：
+1. 什么是容器编排？为什么需要 Kubernetes？
+2. Kubernetes 的核心架构有哪些组件？
+3. Pod、Deployment、Service 的关系是什么？
+4. 请描述 Kubernetes 的网络模型
+5. Kubernetes 如何实现服务发现和负载均衡？
+6. 什么是 ConfigMap 和 Secret？如何使用？
+7. Kubernetes 的存储卷（Volume）有哪些类型？
+8. 请描述 Kubernetes 的调度流程
+9. Ingress 是什么？它和 NodePort、LoadBalancer 的区别？
+10. 在生产环境中使用 Kubernetes 遇到过哪些坑？
+
+---
+
+## 标准答案
+
+### 1. 容器编排概述
+
+#### **为什么需要容器编排**：
+```
+单机 Docker 的痛点：
+├─ 容器生命周期管理复杂
+├─ 服务发现和负载均衡困难
+├─ 滚动更新和回滚复杂
+├─ 资源调度和利用率低
+├─ 高可用和故障自恢复难实现
+└─ 多主机网络配置复杂
+
+Kubernetes 的解决方案：
+├─ 自动化部署和回滚
+├─ 服务发现和负载均衡
+├─ 自我修复（失败重启、节点迁移）
+├─ 自动扩缩容（HPA）
+├─ 存储编排
+└─ 配置管理和密钥管理
+```
+
+---
+
+### 2. Kubernetes 核心架构
+
+#### **架构图**：
+```
+                    ┌─────────────────────────────────┐
+                    │         Control Plane           │
+                    │  (Master 节点)                   │
+                    └─────────────────────────────────┘
+                           │
+         ┌─────────────────┼─────────────────┐
+         │                 │                 │
+    ┌─────────┐      ┌──────────┐      ┌──────────┐
+    │API Server│      │Scheduler │      │Controller│
+    │ (apiserver)│    │ (调度器)   │      │  Manager │
+    └─────────┘      └──────────┘      └──────────┘
+         │                 │                 │
+    ┌─────────┐      ┌──────────┐      ┌──────────┐
+    │   etcd  │      │Cloud Ctl │      │   kube-  │
+    │ (存储)   │      │ Manager  │      │  proxy   │
+    └─────────┘      └──────────┘      └──────────┘
+         │
+         │ HTTP/REST API
+         │
+    ┌─────────────────────────────────────────────┐
+    │             Worker Nodes                    │
+    ├─────────────────────────────────────────────┤
+    │                                             │
+    │  Node 1              Node 2                │
+    │  ┌────────────┐    ┌────────────┐         │
+    │  │ kubelet    │    │ kubelet    │         │
+    │  │ (Pod 代理)  │    │            │         │
+    │  └────────────┘    └────────────┘         │
+    │  ┌────────────┐    ┌────────────┐         │
+    │  │kube-proxy  │    │kube-proxy  │         │
+    │  │ (网络代理)  │    │            │         │
+    │  └────────────┘    └────────────┘         │
+    │  ┌────────────┐    ┌────────────┐         │
+    │  │Container   │    │Container   │         │
+    │  │Runtime     │    │Runtime     │         │
+    │  │(Docker/...)│    │            │         │
+    │  └────────────┘    └────────────┘         │
+    │  ┌────────────┐    ┌────────────┐         │
+    │  │   Pods     │    │   Pods     │         │
+    │  │ ┌────────┐ │    │ ┌────────┐ │         │
+    │  │ │ App 1  │ │    │ │ App 2  │ │         │
+    │  │ └────────┘ │    │ └────────┘ │         │
+    │  └────────────┘    └────────────┘         │
+    │                                             │
+    └─────────────────────────────────────────────┘
+```
+
+#### **核心组件详解**：
+
+**1. API Server（kube-apiserver）**
+- Kubernetes 的入口，所有请求都通过 API Server
+- 认证、授权、准入控制
+- RESTful API
+
+**2. etcd**
+- 分布式键值存储
+- 存储集群所有状态数据
+- Watch 机制，推送变化
+
+**3. Scheduler（调度器）**
+- 负责决定 Pod 调度到哪个节点
+- 调度算法：资源需求、硬件约束、亲和性/反亲和性
+
+**4. Controller Manager**
+- 维护集群状态
+- 常见控制器：
+  - Node Controller：节点故障处理
+  - Replication Controller：副本管理
+  - Endpoint Controller：Service 端点管理
+
+**5. kubelet**
+- 运行在每个节点上
+- 负责 Pod 的生命周期管理
+- 上报节点状态
+
+**6. kube-proxy**
+- 维护网络规则
+- 实现 Service 负载均衡
+
+---
+
+### 3. Pod、Deployment、Service
+
+#### **关系图**：
+```
+Deployment (声明式部署)
+    │
+    ├── 管理 ReplicaSet (副本集)
+    │         │
+    │         └── 管理 Pod (最小调度单元)
+    │                   │
+    │                   ├── Container 1 (应用容器)
+    │                   ├── Container 2 (Sidecar)
+    │                   └── Shared Volume (共享存储)
+
+Service (服务发现)
+    │
+    ├── 通过 Label Selector 选择 Pod
+    │
+    └── 提供稳定的访问入口（IP/DNS）
+```
+
+#### **Pod 示例**：
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: nginx-pod
+  labels:
+    app: nginx
+    env: prod
+spec:
+  containers:
+  - name: nginx
+    image: nginx:1.21
+    ports:
+    - containerPort: 80
+    resources:
+      requests:
+        memory: "64Mi"
+        cpu: "250m"
+      limits:
+        memory: "128Mi"
+        cpu: "500m"
+  - name: sidecar
+    image: fluentd:1.12
+    volumeMounts:
+    - name: log-volume
+      mountPath: /var/log
+  volumes:
+  - name: log-volume
+    emptyDir: {}
+```
+
+#### **Deployment 示例**：
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: nginx-deployment
+  labels:
+    app: nginx
+spec:
+  replicas: 3  # 3 个副本
+  selector:
+    matchLabels:
+      app: nginx
+  template:  # Pod 模板
+    metadata:
+      labels:
+        app: nginx
+    spec:
+      containers:
+      - name: nginx
+        image: nginx:1.21
+        ports:
+        - containerPort: 80
+        livenessProbe:
+          httpGet:
+            path: /
+            port: 80
+          initialDelaySeconds: 3
+          periodSeconds: 3
+        readinessProbe:
+          httpGet:
+            path: /
+            port: 80
+          initialDelaySeconds: 3
+          periodSeconds: 3
+```
+
+#### **Service 示例**：
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: nginx-service
+spec:
+  selector:
+    app: nginx  # 选择 Pod
+  ports:
+  - protocol: TCP
+    port: 80        # Service 端口
+    targetPort: 80  # Pod 端口
+  type: ClusterIP   # 服务类型
+```
+
+**三种 Service 类型**：
+```yaml
+# 1. ClusterIP（默认）
+type: ClusterIP
+# 仅集群内部访问
+
+# 2. NodePort
+type: NodePort
+ports:
+- port: 80
+  targetPort: 80
+  nodePort: 30080  # 每个节点都暴露 30080 端口
+
+# 3. LoadBalancer
+type: LoadBalancer
+# 云服务商提供外部负载均衡器
+```
+
+---
+
+### 4. Kubernetes 网络模型
+
+#### **网络要求**：
+```
+1. 所有 Pod 可以不通过 NAT 直接通信
+2. 所有 Node 可以与所有 Pod 通信
+3. Pod 看到的自己 IP 和别人看到的 IP 一致
+```
+
+#### **网络架构**：
+```
+                    Internet
+                       │
+                       │
+                 ┌──────────┐
+                 │ Ingress  │
+                 └──────────┘
+                       │
+                Service (ClusterIP: 10.0.0.1)
+                       │
+        ┌──────────────┼──────────────┐
+        │              │              │
+    Pod (10.244.1.2) Pod (10.244.1.3) Pod (10.244.2.5)
+    Node 1          Node 1          Node 2
+```
+
+#### **网络插件（CNI）**：
+
+| 插件 | 类型 | 特点 |
+|------|------|------|
+| Flannel | VxLAN/Host-GW | 简单，性能一般 |
+| Calico | BGP | 性能好，支持网络策略 |
+| Cilium | eBPF | 高性能，支持透明代理 |
+| Weave | VxLAN | 简单，加密支持 |
+
+**Calico 示例**：
+```yaml
+# 安装 Calico
+kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml
+
+# 网络策略
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: deny-from-other-namespaces
+  namespace: default
+spec:
+  podSelector:
+    matchLabels:
+      app: nginx
+  ingress:
+  - from:
+    - podSelector: {}
+```
+
+---
+
+### 5. 服务发现和负载均衡
+
+#### **服务发现**：
+```yaml
+# 1. 环境变量（kubelet 会为 Pod 创建前已存在的 Service 自动注入 {SVCNAME}_SERVICE_HOST/_PORT；以下为手动配置示例）
+apiVersion: v1
+kind: Pod
+metadata:
+  name: my-app
+spec:
+  containers:
+  - name: app
+    image: my-app
+    env:
+    - name: DB_SERVICE_HOST
+      value: "mysql-service"
+    - name: DB_SERVICE_PORT
+      value: "3306"
+
+# 2. DNS（推荐）
+# Pod 可以通过 DNS 名称访问 Service
+# mysql-service.default.svc.cluster.local
+```
+
+**Kubernetes DNS 架构**：
+```
+Pod 启动 → /etc/resolv.conf 配置
+    nameserver 10.96.0.10  # kube-dns 的 ClusterIP
+    search default.svc.cluster.local svc.cluster.local cluster.local
+
+    ↓
+
+解析域名
+    mysql-service.default.svc.cluster.local
+    ↓
+    返回 Service ClusterIP (10.0.0.1)
+    ↓
+    kube-proxy 负载均衡到 Pod
+```
+
+#### **负载均衡策略**：
+
+**kube-proxy 三种模式**：
+```yaml
+# 1. Userspace（旧版，性能差）
+mode: userspace
+
+# 2. iptables（默认）
+mode: iptables
+# 使用 iptables 规则实现负载均衡
+
+# 3. ipvs（推荐）
+mode: ipvs
+# 使用 IPVS，性能更好
+```
+
+**Service 负载均衡算法**：
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: nginx-service
+spec:
+  sessionAffinity: ClientIP  # 会话保持
+  sessionAffinityConfig:
+    clientIP:
+      timeoutSeconds: 10800  # 3 小时
+```
+
+---
+
+### 6. ConfigMap 和 Secret
+
+#### **ConfigMap（配置管理）**：
+```yaml
+# 1. 创建 ConfigMap
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: app-config
+data:
+  application.properties: |
+    server.port=8080
+    spring.datasource.url=jdbc:mysql://localhost:3306/db
+  log-level: "info"
+  feature-flags: |
+    featureA=true
+    featureB=false
+
+# 2. 使用 ConfigMap
+apiVersion: v1
+kind: Pod
+metadata:
+  name: my-app
+spec:
+  containers:
+  - name: app
+    image: my-app
+    env:
+    - name: LOG_LEVEL
+      valueFrom:
+        configMapKeyRef:
+          name: app-config
+          key: log-level
+    volumeMounts:
+    - name: config-volume
+      mountPath: /etc/config
+  volumes:
+  - name: config-volume
+    configMap:
+      name: app-config
+```
+
+#### **Secret（密钥管理）**：
+```yaml
+# 1. 创建 Secret
+apiVersion: v1
+kind: Secret
+metadata:
+  name: db-secret
+type: Opaque
+data:
+  username: YWRtaW4=  # base64 编码
+  password: MWYyZDFlMmU2N2Rm
+
+# 2. 使用 Secret
+apiVersion: v1
+kind: Pod
+metadata:
+  name: my-app
+spec:
+  containers:
+  - name: app
+    image: my-app
+    env:
+    - name: DB_USERNAME
+      valueFrom:
+        secretKeyRef:
+          name: db-secret
+          key: username
+    - name: DB_PASSWORD
+      valueFrom:
+        secretKeyRef:
+          name: db-secret
+          key: password
+```
+
+**从文件创建 Secret**：
+```bash
+# 创建 TLS Secret
+kubectl create secret tls my-tls-secret \
+  --cert=path/to/cert.crt \
+  --key=path/to/cert.key
+
+# 创建 Docker Registry Secret
+kubectl create secret docker-registry my-registry-secret \
+  --docker-server=registry.example.com \
+  --docker-username=user \
+  --docker-password=password
+```
+
+---
+
+### 7. 存储卷（Volume）
+
+#### **常见 Volume 类型**：
+
+| 类型 | 说明 | 适用场景 |
+|------|------|----------|
+| emptyDir | 临时目录，Pod 删除后数据丢失 | 临时缓存 |
+| hostPath | 主机路径，Pod 删除后数据保留 | 日志收集、监控 |
+| PersistentVolumeClaim | 持久化存储 | 数据库、应用数据 |
+| ConfigMap | 配置文件 | 应用配置 |
+| Secret | 敏感数据 | 密钥、证书 |
+
+#### **PV/PVC 示例**：
+```yaml
+# 1. PersistentVolume (PV)
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: pv-example
+spec:
+  capacity:
+    storage: 10Gi
+  accessModes:
+  - ReadWriteOnce
+  persistentVolumeReclaimPolicy: Retain
+  nfs:
+    server: 192.168.1.100
+    path: /data/nfs
+
+# 2. PersistentVolumeClaim (PVC)
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: pvc-example
+spec:
+  accessModes:
+  - ReadWriteOnce
+  resources:
+    requests:
+      storage: 5Gi
+
+# 3. 使用 PVC
+apiVersion: v1
+kind: Pod
+metadata:
+  name: my-app
+spec:
+  containers:
+  - name: app
+    image: my-app
+    volumeMounts:
+    - name: data-volume
+      mountPath: /data
+  volumes:
+  - name: data-volume
+    persistentVolumeClaim:
+      claimName: pvc-example
+```
+
+**StorageClass（动态存储分配）**：
+```yaml
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: fast-ssd
+provisioner: kubernetes.io/aws-ebs
+parameters:
+  type: io1  # iopsPerGB 仅对 io1 类型的卷生效
+  iopsPerGB: "10"
+reclaimPolicy: Delete
+volumeBindingMode: Immediate
+```
+
+---
+
+### 8. Kubernetes 调度流程
+
+#### **调度流程图**：
+```
+1. Pod 创建
+   ↓
+2. API Server 接收请求，写入 etcd
+   ↓
+3. Scheduler 监听到未调度的 Pod
+   ↓
+4. 预选（Predicate）：过滤掉不符合条件的节点
+   - 资源是否足够（CPU、内存）
+   - 节点选择器（nodeSelector）
+   - 亲和性/反亲和性
+   - Taints 和 Tolerations
+   ↓
+5. 优选（Priority）：给符合条件的节点打分
+   - 资源利用率
+   - 镜像本地缓存
+   - Pod 分散性
+   ↓
+6. 选择得分最高的节点
+   ↓
+7. 绑定（Binding）：将 Pod 绑定到节点
+   ↓
+8. API Server 更新 Pod 状态
+   ↓
+9. kubelet 监听到 Pod 分配到自己，启动容器
+```
+
+#### **调度约束示例**：
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: my-app
+spec:
+  # 1. nodeSelector（节点选择器）
+  nodeSelector:
+    disktype: ssd
+
+  # 2. Node Affinity（节点亲和性）
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+        - matchExpressions:
+          - key: disktype
+            operator: In
+            values:
+            - ssd
+      preferredDuringSchedulingIgnoredDuringExecution:
+      - weight: 100
+        preference:
+          matchExpressions:
+          - key: zone
+            operator: In
+            values:
+            - cn-shanghai-a
+
+    # 3. Pod Affinity（Pod 亲和性）
+    # 与上面的 nodeAffinity 同属一个 affinity 字段（YAML 中不能重复声明 affinity 键，否则后者会覆盖前者）
+    podAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+      - labelSelector:
+          matchExpressions:
+          - key: app
+            operator: In
+            values:
+            - nginx
+        topologyKey: kubernetes.io/hostname
+
+  # 4. Tolerations（容忍度）
+  tolerations:
+  - key: "dedicated"
+    operator: "Equal"
+    value: "gpu"
+    effect: "NoSchedule"
+```
+
+---
+
+### 9. Ingress vs NodePort vs LoadBalancer
+
+#### **对比表**：
+
+| 类型 | 适用场景 | 优点 | 缺点 |
+|------|----------|------|------|
+| NodePort | 测试、开发 | 简单 | 端口管理复杂、性能一般 |
+| LoadBalancer | 生产环境（云服务商） | 自动负载均衡 | 成本高、依赖云厂商 |
+| Ingress | 生产环境（推荐） | 灵活、支持 7 层路由 | 配置复杂 |
+
+#### **NodePort 示例**：
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: nginx-nodeport
+spec:
+  type: NodePort
+  ports:
+  - port: 80
+    targetPort: 80
+    nodePort: 30080  # 30000-32767
+  selector:
+    app: nginx
+```
+
+#### **LoadBalancer 示例**：
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: nginx-lb
+spec:
+  type: LoadBalancer
+  ports:
+  - port: 80
+    targetPort: 80
+  selector:
+    app: nginx
+```
+
+#### **Ingress 示例**：
+```yaml
+# 1. 安装 Ingress Controller（如 Nginx）
+kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/controller-v1.1.0/deploy/static/provider/cloud/deploy.yaml
+
+# 2. 创建 Ingress
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: nginx-ingress
+  # 注意：不要配置 rewrite-target: /（无捕获组时，/v1/xxx 等所有路径都会被改写为 /）
+spec:
+  ingressClassName: nginx
+  rules:
+  - host: example.com  # 域名
+    http:
+      paths:
+      - path: /
+        pathType: Prefix
+        backend:
+          service:
+            name: nginx-service
+            port:
+              number: 80
+  - host: api.example.com
+    http:
+      paths:
+      - path: /v1
+        pathType: Prefix
+        backend:
+          service:
+            name: api-service
+            port:
+              number: 8080
+```
+
+**Ingress 高级配置**：
+```yaml
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: nginx-ingress
+  annotations:
+    # TLS
+    cert-manager.io/cluster-issuer: "letsencrypt-prod"
+    # 限流
+    nginx.ingress.kubernetes.io/limit-rps: "10"
+    # 超时
+    nginx.ingress.kubernetes.io/proxy-connect-timeout: "600"
+    nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
+    nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
+spec:
+  tls:
+  - hosts:
+    - example.com
+    secretName: example-tls
+  rules:
+  - host: example.com
+    http:
+      paths:
+      - path: /
+        pathType: Prefix
+        backend:
+          service:
+            name: nginx-service
+            port:
+              number: 80
+```
+
+---
+
+### 10. 生产环境踩坑经验
+
+#### **坑 1：Pod 无法启动（ImagePullBackOff）**
+```bash
+# 问题：镜像拉取失败
+kubectl get pods
+NAME                    READY   STATUS              RESTARTS   AGE
+nginx-pod               0/1     ImagePullBackOff    0          2m
+
+# 排查
+kubectl describe pod nginx-pod
+# Events: Failed to pull image "nginx:latest": rpc error: code = Unknown
+
+# 解决
+# 1. 检查镜像名称和标签
+# 2. 检查私有仓库凭证
+kubectl create secret docker-registry my-registry-secret \
+  --docker-server=registry.example.com \
+  --docker-username=user \
+  --docker-password=password
+
+# 3. 在 Pod 中引用 Secret
+spec:
+  imagePullSecrets:
+  - name: my-registry-secret
+```
+
+#### **坑 2：CrashLoopBackOff**
+```bash
+# 问题：Pod 不断重启
+kubectl get pods
+NAME                    READY   STATUS                   RESTARTS   AGE
+nginx-pod               0/1     CrashLoopBackOff         5          10m
+
+# 排查
+kubectl logs nginx-pod
+# Error: Cannot connect to database
+
+# 解决
+# 1. 检查应用日志
+# 2. 检查配置文件
+# 3. 检查依赖服务（数据库、Redis）
+kubectl describe pod nginx-pod
+# 检查 Events
+```
+
+#### **坑 3：资源限制设置不当**
+```yaml
+# 问题：Pod 被杀（OOMKilled）
+# 原因：内存限制太小
+
+# 解决
+resources:
+  requests:
+    memory: "256Mi"   # 保证最小内存
+    cpu: "500m"       # 保证最小 CPU
+  limits:
+    memory: "512Mi"   # 最大内存
+    cpu: "1000m"      # 最大 CPU
+
+# 监控资源使用
+kubectl top pods
+kubectl top nodes
+```
+
+#### **坑 4：滚动更新失败**
+```bash
+# 问题：更新后，所有 Pod 都不可用
+kubectl rollout status deployment/nginx-deployment
+# Waiting for deployment "nginx-deployment" to progress
+
+# 解决
+# 1. 回滚到上一版本
+kubectl rollout undo deployment/nginx-deployment
+
+# 2. 查看历史版本
+kubectl rollout history deployment/nginx-deployment
+
+# 3. 设置健康检查
+livenessProbe:
+  httpGet:
+    path: /health
+    port: 8080
+  initialDelaySeconds: 30
+  periodSeconds: 10
+readinessProbe:
+  httpGet:
+    path: /ready
+    port: 8080
+  initialDelaySeconds: 5
+  periodSeconds: 5
+```
+
+#### **坑 5：DNS 解析失败**
+```bash
+# 问题：Pod 无法访问 Service
+curl http://nginx-service.default.svc.cluster.local
+# curl: (6) Could not resolve host
+
+# 排查
+kubectl exec -it my-app -- cat /etc/resolv.conf
+# nameserver 10.96.0.10
+
+# 解决
+# 1. 检查 kube-dns/CoreDNS 是否运行
+kubectl get pods -n kube-system
+
+# 2. 检查 DNS 配置
+kubectl get configmap coredns -n kube-system -o yaml
+
+# 3. 重启 DNS
+kubectl rollout restart deployment/coredns -n kube-system
+```
+
+---
+
+### 11. 实际项目经验
+
+#### **场景 1：高可用部署**
+```yaml
+# 需求：保证服务高可用
+# 方案：
+# 1. 多副本
+replicas: 3
+
+# 2. Pod 反亲和性（分散到不同节点）
+affinity:
+  podAntiAffinity:
+    preferredDuringSchedulingIgnoredDuringExecution:
+    - weight: 100
+      podAffinityTerm:
+        labelSelector:
+          matchExpressions:
+          - key: app
+            operator: In
+            values:
+            - nginx
+        topologyKey: kubernetes.io/hostname
+
+# 3. 健康检查
+livenessProbe:
+  httpGet:
+    path: /health
+    port: 8080
+  initialDelaySeconds: 30
+  periodSeconds: 10
+readinessProbe:
+  httpGet:
+    path: /ready
+    port: 8080
+  initialDelaySeconds: 5
+  periodSeconds: 5
+
+# 4. 资源限制
+resources:
+  requests:
+    memory: "256Mi"
+    cpu: "500m"
+  limits:
+    memory: "512Mi"
+    cpu: "1000m"
+```
+
+#### **场景 2：自动扩缩容（HPA）**
+```yaml
+# 1. 安装 Metrics Server
+kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml
+
+# 2. 创建 HPA
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: nginx-hpa
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: nginx-deployment
+  minReplicas: 2
+  maxReplicas: 10
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      target:
+        type: Utilization
+        averageUtilization: 50  # CPU 使用率超过 50% 时扩容
+  - type: Resource
+    resource:
+      name: memory
+      target:
+        type: Utilization
+        averageUtilization: 80  # 内存使用率超过 80% 时扩容
+```
+
+#### **场景 3：配置管理**
+```yaml
+# 需求：不同环境使用不同配置
+# 方案：使用 ConfigMap
+
+# 开发环境
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: app-config-dev
+  namespace: dev
+data:
+  spring.profiles.active: "dev"
+  spring.datasource.url: "jdbc:mysql://dev-mysql:3306/db"
+
+# 生产环境
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: app-config-prod
+  namespace: prod
+data:
+  spring.profiles.active: "prod"
+  spring.datasource.url: "jdbc:mysql://prod-mysql:3306/db"
+
+# Pod 使用
+apiVersion: v1
+kind: Pod
+metadata:
+  name: my-app
+  namespace: prod
+spec:
+  containers:
+  - name: app
+    image: my-app
+    envFrom:
+    - configMapRef:
+        name: app-config-prod
+```
+
+---
+
+### 12. 阿里 P7 加分项
+
+**架构设计能力**：
+- 设计过大规模 Kubernetes 集群（1000+ 节点）
+- 有多集群/多云 Kubernetes 管理经验
+- 实现过自定义 Controller 和 Operator
+
+**深度理解**：
+- 熟悉 Kubernetes 源码（调度器、控制器、网络模型）
+- 理解 Container Runtime（Docker、Containerd、CRI-O）
+- 有 CNI 插件开发经验
+
+**性能调优**：
+- 优化过 etcd 性能（存储压缩、快照策略）
+- 调整过 kubelet 参数（最大 Pod 数、镜像垃圾回收）
+- 优化过网络性能（CNI 插件选择、MTU 配置）
+
+**生产实践**：
+- 主导过从 Docker Swarm 迁移到 Kubernetes
+- 解决过生产环境的疑难问题（网络分区、etcd 数据恢复）
+- 实现过 Kubernetes 多租户隔离
+
+**云原生生态**：
+- 熟悉 Helm Chart 开发和模板化部署
+- 使用过 Prometheus + Grafana 监控 Kubernetes
+- 实现过 Kubernetes CI/CD 流程（GitOps、ArgoCD）
+
+**安全实践**：
+- 实现 Pod Security Standards（通过 Pod Security Admission 落地，替代已在 1.25 移除的 Pod Security Policy）
+- 有 RBAC 权限管理经验
+- 使用过 Falco/Kyverno 实现安全策略
+- 实现过镜像签名和验证（Notary）
+
+**成本优化**：
+- 使用 Cluster Autoscaler 自动扩缩节点
+- 实现过 Pod 优先级和抢占机制
+- 使用 Spot 实例降低成本
+- 资源配额和限制范围管理
diff --git a/questions/observability.md b/questions/observability.md
new file mode 100644
index 0000000..c6d838a
--- /dev/null
+++ b/questions/observability.md
@@ -0,0 +1,1179 @@
+# 可观测性 (Observability)
+
+## 问题
+
+**背景**：在分布式系统中，如何快速定位和解决问题成为关键挑战。可观测性通过监控、日志和链路追踪三大支柱，帮助开发和运维团队理解系统内部状态。
+
+**问题**：
+1. 什么是可观测性？它和监控有什么区别？
+2. 监控、日志、链路追踪三大支柱的作用是什么？
+3. Prometheus + Grafana 监控架构是如何设计的？
+4. ELK（Elasticsearch、Logstash、Kibana）日志栈如何搭建？
+5. 分布式追踪（Jaeger/Zipkin）的原理是什么？
+6. 如何设计监控告警规则？
+7. 如何实现全链路追踪？
+8. 如何定位性能瓶颈？
+9. 如何设计监控指标体系？
+10. 在实际项目中如何落地可观测性？
+
+---
+
+## 标准答案
+
+### 1. 可观测性概述
+
+#### **定义**：
+```
+可观测性（Observability）：
+通过系统外部输出（Metrics、Logs、Traces）推断系统内部状态的能力
+
+监控（Monitoring）：
+通过预定义的指标检查系统是否正常运行
+```
+
+#### **对比**：
+```
+监控（Monitoring）：
+├─ 主动询问系统状态（预设规则）
+├─ 关注已知问题（如 CPU 使用率 > 80%）
+└─ 问题：无法发现未知问题
+
+可观测性（Observability）：
+├─ 被动收集系统输出（数据驱动）
+├─ 可以发现未知问题
+└─ 支持根因分析（Root Cause Analysis）
+```
+
+#### **三大支柱**：
+```
+1. Metrics（指标）：数值型数据
+   - Counter（计数器）：请求数、错误数
+   - Gauge（仪表盘）：CPU 使用率、内存使用量
+   - Histogram（直方图）：请求延迟分布
+
+2. Logs（日志）：离散事件
+   - 应用日志：错误日志、调试日志
+   - 访问日志：Nginx access.log
+   - 审计日志：操作记录
+
+3. Traces（追踪）：请求路径
+   - Trace：一次完整的请求（从客户端到后端）
+   - Span：单个服务的处理过程
+   - Span ID、Trace ID：关联标识
+```
+
+---
+
+### 2. 三大支柱详解
+
+#### **Metrics（指标）**：
+```yaml
+# Prometheus 指标示例
+# 1. Counter（只增不减）
+http_requests_total{method="GET",path="/api/users",status="200"} 12345
+
+# 2. Gauge（可增可减）
+memory_usage_bytes{instance="localhost:8080"} 1073741824
+cpu_usage_percent{instance="localhost:8080"} 45.2
+
+# 3. Histogram（分布）
+http_request_duration_seconds_bucket{le="0.1"} 5000
+http_request_duration_seconds_bucket{le="0.5"} 9500
+http_request_duration_seconds_bucket{le="+Inf"} 10000
+```
+
+**代码示例（Spring Boot Actuator + Micrometer）**：
+```java
+@RestController
+public class UserController {
+
+    private final Counter requestCounter;
+    private final Gauge memoryGauge;
+
+    public UserController(MeterRegistry registry) {
+        this.requestCounter = Counter.builder("http.requests.total")
+            .tag("method", "GET")
+            .tag("path", "/api/users")
+            .register(registry);
+
+        this.memoryGauge = Gauge.builder("jvm.memory.used", Runtime.getRuntime(), rt -> rt.totalMemory() - rt.freeMemory())
+            .register(registry);
+    }
+
+    @GetMapping("/api/users")
+    public List<User> getUsers() {
+        requestCounter.increment();
+        return userService.findAll();
+    }
+}
+```
+
+#### **Logs（日志）**：
+```java
+// 结构化日志（JSON 格式）
+@Slf4j
+@RestController
+public class UserController {
+
+    @GetMapping("/api/users/{id}")
+    public User getUserById(@PathVariable Long id) {
+        log.info("Get user by id", logContext()
+            .with("userId", id)
+            .with("traceId", MDC.get("traceId"))
+            .with("spanId", MDC.get("spanId"))
+        );
+
+        User user = userService.findById(id);
+
+        if (user == null) {
+            log.warn("User not found", logContext()
+                .with("userId", id)
+                .with("traceId", MDC.get("traceId"))
+            );
+            throw new UserNotFoundException(id);
+        }
+
+        return user;
+    }
+
+    private LogContext logContext() {
+        return new LogContext();
+    }
+}
+
+// 日志输出
+{
+  "timestamp": "2024-01-01T10:00:00Z",
+  "level": "INFO",
+  "logger": "com.example.UserController",
+  "message": "Get user by id",
+  "userId": 123,
+  "traceId": "a1b2c3d4e5f6g7h8",
+  "spanId": "i9j0k1l2m3n4o5p6",
+  "thread": "http-nio-8080-exec-1"
+}
+```
+
+#### **Traces（追踪）**：
+```
+Trace（一次完整请求）：
+Client → Gateway → Service A → Service B → Service C
+   │         │           │            │            │
+   └─────────┴───────────┴────────────┴────────────┘
+                    Trace ID: abc123
+
+Span（单个服务处理）：
+Gateway (Span 1)
+  ├─ Service A (Span 2)
+  │   └─ Service B (Span 3)
+  │       └─ Service C (Span 4)
+```
+
+---
+
+### 3. Prometheus + Grafana 架构
+
+#### **架构图**：
+```
+                    ┌─────────────────┐
+                    │   Applications  │
+                    │  ( exporters )  │
+                    └─────────────────┘
+                           │
+                           │ /metrics
+                           │
+                    ┌─────────────────┐
+                    │  Prometheus     │
+                    │  (Pull 指标)     │
+                    └─────────────────┘
+                           │
+                           │ 存储
+                           ▼
+                    ┌─────────────────┐
+                    │ TSDB (时序数据库)│
+                    └─────────────────┘
+                           │
+                           │ 查询
+                           ▼
+                    ┌─────────────────┐
+                    │    Grafana      │
+                    │  (可视化仪表盘)  │
+                    └─────────────────┘
+                           │
+                           │ 告警
+                           ▼
+                    ┌─────────────────┐
+                    │  Alertmanager   │
+                    │  (告警路由)      │
+                    └─────────────────┘
+                           │
+                           │ 通知
+                           ▼
+                    ┌─────────────────┐
+                    │  Email/Webhook  │
+                    │  钉钉/企业微信    │
+                    └─────────────────┘
+```
+
+#### **Prometheus 配置**：
+```yaml
+# prometheus.yml
+global:
+  scrape_interval: 15s  # 每 15 秒采集一次
+  evaluation_interval: 15s  # 每 15 秒评估告警规则
+
+# 告警规则
+rule_files:
+  - "alerts/*.yml"
+
+# 抓取配置
+scrape_configs:
+  # Spring Boot Actuator
+  - job_name: 'spring-boot'
+    metrics_path: '/actuator/prometheus'
+    static_configs:
+      - targets: ['localhost:8080']
+
+  # Kubernetes 服务发现
+  - job_name: 'kubernetes-pods'
+    kubernetes_sd_configs:
+      - role: pod
+    relabel_configs:
+      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
+        action: keep
+        regex: true
+      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
+        action: replace
+        target_label: __metrics_path__
+        regex: (.+)
+
+# 告警管理
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - alertmanager:9093
+```
+
+#### **Spring Boot 集成**：
+```xml
+<!-- pom.xml -->
+<dependency>
+    <groupId>org.springframework.boot</groupId>
+    <artifactId>spring-boot-starter-actuator</artifactId>
+</dependency>
+<dependency>
+    <groupId>io.micrometer</groupId>
+    <artifactId>micrometer-registry-prometheus</artifactId>
+</dependency>
+```
+
+```yaml
+# application.yml
+management:
+  endpoints:
+    web:
+      exposure:
+        include: prometheus,health,info
+  metrics:
+    export:
+      prometheus:
+        enabled: true
+    tags:
+      application: ${spring.application.name}
+```
+
+#### **Grafana Dashboard**：
+```json
+{
+  "dashboard": {
+    "title": "Spring Boot Metrics",
+    "panels": [
+      {
+        "title": "Request Rate",
+        "targets": [
+          {
+            "expr": "rate(http_server_requests_seconds_count[1m])",
+            "legendFormat": "{{method}} {{uri}}"
+          }
+        ],
+        "type": "graph"
+      },
+      {
+        "title": "Request Latency",
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.95, rate(http_server_requests_seconds_bucket[1m]))",
+            "legendFormat": "P95 Latency"
+          }
+        ],
+        "type": "graph"
+      },
+      {
+        "title": "JVM Memory Usage",
+        "targets": [
+          {
+            "expr": "jvm_memory_used_bytes{area=\"heap\"}",
+            "legendFormat": "Heap Used"
+          }
+        ],
+        "type": "graph"
+      }
+    ]
+  }
+}
+```
+
+---
+
+### 4. ELK 日志栈
+
+#### **架构图**：
+```
+                    ┌─────────────────┐
+                    │   Applications  │
+                    │  (日志输出)      │
+                    └─────────────────┘
+                           │
+                           │ Filebeat/Logstash
+                           │
+                    ┌─────────────────┐
+                    │    Logstash     │
+                    │  (日志处理)      │
+                    ├─────────────────┤
+                    │ - 过滤          │
+                    │ - 转换          │
+                    │ - 解析          │
+                    └─────────────────┘
+                           │
+                           │
+                    ┌─────────────────┐
+                    │  Elasticsearch  │
+                    │  (日志存储)      │
+                    └─────────────────┘
+                           │
+                           │ 查询
+                           ▼
+                    ┌─────────────────┐
+                    │     Kibana      │
+                    │  (日志可视化)    │
+                    └─────────────────┘
+```
+
+#### **Logstash 配置**：
+```conf
+# logstash.conf
+input {
+  file {
+    path => "/var/log/app/*.log"
+    start_position => "beginning"
+    codec => json
+  }
+
+  beats {
+    port => 5044
+  }
+}
+
+filter {
+  # 解析 JSON 日志
+  json {
+    source => "message"
+  }
+
+  # 提取时间戳
+  date {
+    match => ["timestamp", "ISO8601"]
+  }
+
+  # 提取 Trace ID
+  grok {
+    match => {
+      "message" => '"traceId":"%{DATA:traceId}"'
+    }
+  }
+
+  # 添加应用名称
+  mutate {
+    add_field => {
+      "application" => "my-app"
+    }
+  }
+}
+
+output {
+  elasticsearch {
+    hosts => ["elasticsearch:9200"]
+    index => "my-app-%{+YYYY.MM.dd}"
+  }
+
+  stdout {
+    codec => rubydebug
+  }
+}
+```
+
+#### **Filebeat 配置**：
+```yaml
+# filebeat.yml
+filebeat.inputs:
+  - type: log
+    enabled: true
+    paths:
+      - /var/log/app/*.log
+    json.keys_under_root: true
+    json.add_error_key: true
+    fields:
+      app: my-app
+      env: production
+
+output.logstash:
+  hosts: ["logstash:5044"]
+
+# 日志 multiline 处理
+multiline.type: pattern
+multiline.pattern: '^[0-9]{4}-[0-9]{2}-[0-9]{2}'
+multiline.negate: true
+multiline.match: after
+```
+
+#### **Kibana 查询**：
+```
+# 1. 简单查询
+level: "ERROR"
+
+# 2. 范围查询
+@timestamp: [now-1h TO now]
+
+# 3. 通配符
+message: "*NullPointerException*"
+
+# 4. 正则表达式
+message: /.*User \d+ not found.*/
+
+# 5. 聚合查询
+# 按错误级别统计
+# 查询栏（KQL/Lucene）不支持管道聚合：先用 level: "ERROR" 过滤，再在 Lens/Visualize 中对 level 做 terms 聚合
+
+# 按时间统计
+# histogram @timestamp, interval 1m
+
+# 按服务统计
+# terms appName
+
+# 6. 全链路追踪
+# 查询同一 Trace ID 的所有日志
+traceId: "a1b2c3d4e5f6g7h8"
+```
+
+---
+
+### 5. 分布式追踪
+
+#### **原理**：
+```
+1. 客户端请求生成 Trace ID
+2. 每个服务处理时生成 Span
+3. Span 记录：
+   - Span ID（当前 Span 唯一 ID）
+   - Parent Span ID（父 Span ID）
+   - Trace ID（全局 Trace ID）
+   - Timestamp（开始时间）
+   - Duration（耗时）
+   - Tags（标签）
+   - Logs（日志）
+4. Span 上报到 Jaeger/Zipkin
+5. 追踪系统构建调用链
+```
+
+#### **Jaeger 架构**：
+```
+                    ┌─────────────────┐
+                    │   Applications  │
+                    │  (Jaeger Client)│
+                    └─────────────────┘
+                           │
+                           │ UDP/HTTP
+                           │
+                    ┌─────────────────┐
+                    │     Agent       │
+                    │  (数据采集)      │
+                    └─────────────────┘
+                           │
+                           │
+                    ┌─────────────────┐
+                    │     Collector   │
+                    │  (数据处理)      │
+                    └─────────────────┘
+                           │
+                           │
+        ┌──────────────────┼──────────────────┐
+        │                  │                  │
+   ┌─────────┐       ┌──────────┐       ┌──────────┐
+   │Elasticsearch│    │  Cassandra│     │ Kafka    │
+   └─────────┘       └──────────┘       └──────────┘
+        │
+        │ 查询
+        ▼
+   ┌─────────┐
+   │  Query  │
+   │ Service │
+   └─────────┘
+        │
+        │ Web UI
+        ▼
+   ┌─────────┐
+   │   Web   │
+   │   UI    │
+   └─────────┘
+```
+
+#### **Spring Boot 集成 Jaeger**：
+```xml
+<!-- pom.xml -->
+<dependency>
+    <groupId>io.opentracing.contrib</groupId>
+    <artifactId>opentracing-spring-jaeger-web-starter</artifactId>
+</dependency>
+```
+
+```yaml
+# application.yml
+opentracing:
+  jaeger:
+    enabled: true
+    service-name: my-app
+    udp-sender:
+      host: jaeger-agent
+      port: 6831
+    sampler:
+      probability: 0.1  # 10% 采样
+```
+
+**代码示例**：
+```java
+@RestController
+public class UserController {
+
+    private final Tracer tracer;
+
+    @GetMapping("/api/users/{id}")
+    public User getUserById(@PathVariable Long id) {
+        // 创建自定义 Span
+        Span span = tracer.buildSpan("getUserById")
+            .withTag("userId", id)
+            .start();
+
+        try (Scope scope = tracer.scopeManager().activate(span)) {
+            User user = userService.findById(id);
+
+            if (user == null) {
+                span.setTag("error", true);
+                span.log("User not found");
+                throw new UserNotFoundException(id);
+            }
+
+            return user;
+        } finally {
+            span.finish();
+        }
+    }
+}
+```
+
+#### **Zipkin 集成**：
+```xml
+<!-- pom.xml -->
+<dependency>
+    <groupId>org.springframework.cloud</groupId>
+    <artifactId>spring-cloud-starter-zipkin</artifactId>
+</dependency>
+```
+
+```yaml
+# application.yml
+spring:
+  zipkin:
+    base-url: http://zipkin:9411
+  sleuth:
+    sampler:
+      probability: 0.1  # 10% 采样
+```
+
+---
+
+### 6. 监控告警规则
+
+#### **Prometheus 告警规则**：
+```yaml
+# alerts.yml
+groups:
+  - name: application_alerts
+    interval: 30s
+    rules:
+      # 高错误率
+      - alert: HighErrorRate
+        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High error rate detected"
+          description: "Error rate is {{ $value }} errors/sec"
+
+      # 高延迟
+      - alert: HighLatency
+        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 0.5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High latency detected"
+          description: "P95 latency is {{ $value }} seconds"
+
+      # 服务下线
+      - alert: ServiceDown
+        expr: up{job="spring-boot"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Service is down"
+          description: "{{ $labels.instance }} is down"
+
+      # JVM 内存使用率高
+      - alert: HighMemoryUsage
+        expr: jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"} > 0.9
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High memory usage"
+          description: "Heap memory usage is {{ $value | humanizePercentage }}"
+
+      # 磁盘空间不足
+      - alert: DiskSpaceLow
+        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk space low"
+          description: "Disk space is {{ $value | humanizePercentage }} available"
+```
+
+#### **Alertmanager 配置**：
+```yaml
+# alertmanager.yml
+global:
+  resolve_timeout: 5m
+
+# 路由配置
+route:
+  group_by: ['alertname', 'cluster', 'service']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 12h
+  receiver: 'default'
+
+  routes:
+    - match:
+        severity: critical
+      receiver: 'critical'
+      continue: true
+
+    - match:
+        severity: warning
+      receiver: 'warning'
+
+# 接收器配置
+receivers:
+  - name: 'default'
+    webhook_configs:
+      - url: 'http://webhook-server/default'
+
+  - name: 'critical'
+    webhook_configs:
+      - url: 'http://webhook-server/critical'
+    email_configs:
+      - to: 'oncall@example.com'
+        from: 'alertmanager@example.com'
+        smarthost: 'smtp.example.com:587'
+        auth_username: 'alertmanager@example.com'
+        auth_password: 'password'
+
+  - name: 'warning'
+    webhook_configs:
+      - url: 'http://webhook-server/warning'
+
+# 抑制规则
+inhibit_rules:
+  - source_match:
+      severity: 'critical'
+    target_match:
+      severity: 'warning'
+    equal: ['alertname', 'cluster', 'service']
+```
+
+#### **钉钉告警**：
+```python
+# 钉钉 Webhook 示例
+from flask import Flask, request
+import requests
+import json
+
+app = Flask(__name__)
+
+@app.route('/alertmanager', methods=['POST'])
+def alertmanager():
+    data = request.json
+
+    for alert in data.get('alerts', []):
+        status = alert.get('status')
+        labels = alert.get('labels', {})
+        annotations = alert.get('annotations', {})
+
+        message = {
+            "msgtype": "markdown",
+            "markdown": {
+                "title": f"Alert: {labels.get('alertname')}",
+                "text": f"""
+### {labels.get('alertname')}
+
+**Status:** {status}
+**Severity:** {labels.get('severity')}
+**Instance:** {labels.get('instance')}
+
+**Summary:** {annotations.get('summary')}
+**Description:** {annotations.get('description')}
+
+**Starts:** {alert.get('startsAt')}
+                """
+            }
+        }
+
+        requests.post(
+            'https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN',
+            json=message
+        )
+
+    return 'OK'
+
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=5000)
+```
+
+---
+
+### 7. 全链路追踪
+
+#### **实现方案**：
+```
+1. 客户端生成 Trace ID
+2. HTTP Header 传递 Trace ID
+   - X-Trace-Id
+   - X-Span-Id
+3. 每个服务记录 Span
+4. 异步上报到 Jaeger/Zipkin
+5. 追踪系统构建调用链
+```
+
+#### **Spring Cloud Sleuth 实现**：
+```java
+// 1. 配置 Sleuth
+@Configuration
+public class TracingConfig {
+
+    @Bean
+    public HttpTraceCustomizer httpTraceCustomizer() {
+        return (builder) -> builder.include(EVERYTHING);
+    }
+}
+
+// 2. RestTemplate 传递 Trace ID
+@Configuration
+public class RestTemplateConfig {
+
+    @Bean
+    public RestTemplate restTemplate() {
+        return new RestTemplate();
+    }
+
+    @Bean
+    public RestTemplateCustomizer restTemplateCustomizer(Tracer tracer) {
+        return restTemplate -> {
+            restTemplate.setInterceptors(Collections.singletonList(new ClientHttpRequestInterceptor() {
+                @Override
+                public ClientHttpResponse intercept(HttpRequest request, byte[] body, ClientHttpRequestExecution execution) throws IOException {
+                    Span span = tracer.currentSpan();
+                    if (span != null) {
+                        request.getHeaders().add("X-Trace-Id", span.context().traceId());
+                        request.getHeaders().add("X-Span-Id", span.context().spanId());
+                    }
+                    return execution.execute(request, body);
+                }
+            }));
+        };
+    }
+}
+
+// 3. Kafka 消息传递 Trace ID
+@Configuration
+public class KafkaConfig {
+
+    @Bean
+    public ProducerFactory<String, String> producerFactory(Tracer tracer) {
+        Map<String, Object> configProps = new HashMap<>();
+        configProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
+
+        return new DefaultKafkaProducerFactory<>(configProps,
+            new StringSerializer(),
+            new StringSerializer());
+    }
+}
+
+// 4. 数据库查询传递 Trace ID
+@Configuration
+public class DatabaseConfig {
+
+    @Bean
+    public DataSource dataSource(Tracer tracer) {
+        HikariDataSource dataSource = new HikariDataSource();
+        dataSource.setJdbcUrl("jdbc:mysql://localhost:3306/db");
+
+        dataSource.setConnectionTestQuery("SELECT 1");
+        // 注意：不要在此处用 connectionInitSql 写入 Trace ID —— Bean 初始化时没有活跃 Span（会 NPE），且该 SQL 每个连接只执行一次；
+        // 应在每次执行 SQL 时注入当前 Trace ID，例如以 SQL 注释形式：/* traceId=xxx */ SELECT ...
+        return dataSource;
+    }
+}
+```
+
+#### **Trace ID 关联日志**：
+```java
+// 使用 MDC 传递 Trace ID
+@Slf4j
+@Component
+public class TraceIdFilter implements Filter {
+
+    @Override
+    public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain) throws IOException, ServletException {
+        String traceId = ((HttpServletRequest) request).getHeader("X-Trace-Id");
+        if (traceId == null) {
+            traceId = UUID.randomUUID().toString();
+        }
+
+        MDC.put("traceId", traceId);
+
+        try {
+            chain.doFilter(request, response);
+        } finally {
+            MDC.clear();
+        }
+    }
+}
+```
+```xml
+<configuration> <!-- Logback 配置 -->
+    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - traceId=%X{traceId} - %msg%n</pattern>
+        </encoder>
+    </appender>
+
+    <root level="INFO">
+        <appender-ref ref="CONSOLE" />
+    </root>
+</configuration>
+```
+
+---
+
+### 8. 性能瓶颈定位
+
+#### **定位流程**：
+```
+1. 监控告警（Prometheus）
+   - CPU 使用率高
+   - 内存使用率高
+   - 请求延迟高
+
+2. 链路追踪（Jaeger）
+   - 定位慢请求
+   - 找出耗时最长的服务
+
+3. 日志分析（ELK）
+   - 查找错误日志
+   - 分析异常堆栈
+
+4. 性能分析（Profiling）
+   - CPU Profiling
+   - Memory Profiling
+   - Thread Dump
+```
+
+#### **案例 1：慢查询定位**
+```sql
+-- 1. 开启 MySQL 慢查询日志
+SET GLOBAL slow_query_log = 'ON';
+SET GLOBAL long_query_time = 1;
+
+-- 2. 分析慢查询日志
+pt-query-digest /var/log/mysql/slow.log
+
+-- 3. 优化 SQL
+-- 添加索引
+CREATE INDEX idx_user_email ON users(email);
+
+-- 重写查询
+-- Before
+SELECT * FROM users WHERE LOWER(email) = 'alice@example.com';
+
+-- After
+SELECT * FROM users WHERE email = 'alice@example.com';
+```
+
+#### **案例 2：内存泄漏定位**
+```bash
+# 1. 导出堆转储
+jmap -dump:format=b,file=heap.hprof <pid>
+
+# 2. 使用 MAT 分析
+# - 查看 Dominator Tree
+# - 查找 Leak Suspects
+# - 查看 Histogram
+
+# 3. 定位泄漏代码
+# - 未关闭的资源（Connection、Stream）
+# - 静态集合持有大对象
+# - 缓存未设置过期时间
+```
+
+#### **案例 3：CPU 高负载定位**
+```bash
+# 1. 查看 CPU 使用率
+top -p <pid>
+
+# 2. 导出线程快照
+jstack <pid> > thread.dump
+
+# 3. 查找繁忙线程
+printf "%x\n" <tid>  # 转换为 16 进制
+grep -A 20 <tid-hex> thread.dump
+
+# 4. 分析代码
+# - 死循环
+# - 正则表达式（回溯）
+# - 大对象序列化
+```
+
+---
+
+### 9. 监控指标体系
+
+#### **分层指标**：
+```
+1. 基础设施层（Infrastructure）
+   - CPU 使用率
+   - 内存使用率
+   - 磁盘 I/O
+   - 网络流量
+
+2. 平台层（Platform）
+   - Kubernetes 集群健康
+   - Pod 数量
+   - Node 状态
+
+3. 中间件层（Middleware）
+   - Redis：连接数、命令执行时间、内存使用率
+   - MySQL：QPS、慢查询、连接数、主从延迟
+   - Kafka：消息积压、消费延迟
+
+4. 应用层（Application）
+   - QPS（每秒请求数）
+   - Latency（延迟 P50、P95、P99）
+   - Error Rate（错误率）
+   - Saturation（饱和度）
+
+5. 业务层（Business）
+   - 订单量
+   - 支付成功率
+   - 用户活跃度
+```
+
+#### **RED 方法**：
+```
+R - Rate (请求速率)
+- QPS（Queries Per Second）
+- RPS（Requests Per Second）
+
+E - Errors (错误率)
+- HTTP 5xx 错误率
+- 业务异常率
+
+D - Duration (请求耗时)
+- P50（中位数）
+- P95（95 分位）
+- P99（99 分位）
+```
+
+#### **USE 方法**：
+```
+U - Utilization (资源利用率)
+- CPU 使用率
+- 内存使用率
+- 磁盘使用率
+
+S - Saturation (资源饱和度)
+- CPU 运行队列长度
+- 内存 Swap 使用量
+- 磁盘 I/O 等待时间
+
+E - Errors (错误数)
+- 硬件错误（ECC、磁盘坏道）
+- 软件错误（OOM、连接超时）
+```
+
+#### **Grafana Dashboard 示例**：
+```json
+{
+  "dashboard": {
+    "title": "Service Overview",
+    "panels": [
+      {
+        "title": "QPS",
+        "targets": [
+          {
+            "expr": "sum(rate(http_requests_total[1m]))"
+          }
+        ]
+      },
+      {
+        "title": "Error Rate",
+        "targets": [
+          {
+            "expr": "sum(rate(http_requests_total{status=~\"5..\"}[1m])) / sum(rate(http_requests_total[1m]))"
+          }
+        ]
+      },
+      {
+        "title": "P95 Latency",
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[1m]))"
+          }
+        ]
+      },
+      {
+        "title": "CPU Usage",
+        "targets": [
+          {
+            "expr": "rate(process_cpu_seconds_total[1m])"
+          }
+        ]
+      },
+      {
+        "title": "Memory Usage",
+        "targets": [
+          {
+            "expr": "jvm_memory_used_bytes{area=\"heap\"} / jvm_memory_max_bytes{area=\"heap\"}"
+          }
+        ]
+      }
+    ]
+  }
+}
+```
+
+---
+
+### 10. 实际项目落地
+
+#### **场景 1：电商系统监控**
+```
+需求：
+- 监控订单接口性能
+- 发现慢查询并优化
+- 监控支付成功率
+
+方案：
+1. Prometheus 监控
+   - QPS、延迟、错误率
+   - JVM 指标
+   - MySQL 慢查询
+
+2. Jaeger 链路追踪
+   - 订单创建流程
+   - 支付流程
+
+3. ELK 日志分析
+   - 订单日志
+   - 支付日志
+
+4. Grafana 仪表盘
+   - 业务指标（订单量、支付成功率）
+   - 技术指标（QPS、延迟）
+```
+
+#### **场景 2：微服务链路追踪**
+```
+需求：
+- 追踪跨服务请求
+- 定位性能瓶颈
+- 分析服务依赖
+
+方案：
+1. Spring Cloud Sleuth 生成 Trace ID
+2. Jaeger 收集 Span
+3. Kibana 关联日志（通过 Trace ID）
+4. Prometheus 监控每个服务性能
+
+示例：
+用户下单
+├─ 订单服务（创建订单）
+├─ 库存服务（扣减库存）
+├─ 支付服务（创建支付）
+└─ 物流服务（分配物流）
+
+通过 Trace ID 关联所有服务的日志
+```
+
+---
+
+### 11. 阿里 P7 加分项
+
+**架构设计能力**：
+- 设计过企业级可观测性平台（统一监控、日志、追踪）
+- 有多集群、多地域的监控架构经验
+- 实现过自定义监控 Agent 和 Collector
+
+**深度理解**：
+- 熟悉 Prometheus 内部机制（TSDB、存储引擎、查询引擎）
+- 理解 Elasticsearch 底层原理（Lucene、分片、副本）
+- 有 Jaeger/Zipkin 源码阅读经验
+
+**性能优化**：
+- 优化过 Prometheus 查询性能（Recording Rules、联邦）
+- 优化过 Elasticsearch 索引性能（分片策略、Mapping 设计）
+- 优化过日志采集性能（采样率、批量上传）
+
+**生产实践**：
+- 解决过海量数据存储和查询问题（数据降采样、冷热分离）
+- 实现过智能告警（动态阈值、异常检测、机器学习）
+- 有故障快速定位经验（根因分析、故障复盘）
+
+**开源贡献**：
+- 向 Prometheus/Grafana/Jaeger 社区提交过 PR
+- 开发过自定义 Exporter
+- 编写过相关技术博客或演讲
+
+**可观测性最佳实践**：
+- 实现 SLO/SLI（Service Level Objective/Indicator）
+- 使用 Error Budget 管理发布节奏
+- 有混沌工程实践（Chaos Engineering）
+- 实现 APM（Application Performance Monitoring）
+
+**业务监控**：
+- 设计过业务指标大盘
+- 实现过实时数据大屏（Druid、ClickHouse）
+- 有用户行为分析经验（埋点、漏斗分析）
diff --git a/questions/rpc-framework.md b/questions/rpc-framework.md
new file mode 100644
index 0000000..929fe07
--- /dev/null
+++ b/questions/rpc-framework.md
@@ -0,0 +1,745 @@
+# RPC 框架
+
+## 问题
+
+**背景**：在分布式系统中，服务间通信需要高效、可靠的远程调用机制。RPC（Remote Procedure Call）框架屏蔽了网络通信的复杂性，使远程调用像本地调用一样简单。
+
+**问题**：
+1. 什么是 RPC？它和 HTTP REST 有什么区别？
+2. Dubbo 的核心架构和工作原理是什么？
+3. gRPC 的优势是什么？它如何实现高性能？
+4. 请描述 Dubbo 的负载均衡策略
+5. Dubbo 的服务注册与发现机制是怎样的？
+6. RPC 框架如何实现序列化？常见的序列化协议有哪些？
+7. 在实际项目中如何选择 RPC 框架？
+8. RPC 框架如何处理超时、重试和熔断？
+
+---
+
+## 标准答案
+
+### 1. RPC vs HTTP REST
+
+#### **RPC 定义**：
+远程过程调用（Remote Procedure Call）是一种计算机通信协议，允许运行在一台计算机的程序调用另一台计算机的子程序，而开发者无需额外编码这种交互。
+
+#### **对比表**：
+
+| 特性 | RPC (Dubbo/gRPC) | HTTP REST |
+|------|------------------|-----------|
+| 传输协议 | TCP 长连接（Dubbo）/ HTTP/2（gRPC） | HTTP/1.1（默认 Keep-Alive，但同一连接上请求串行）/ HTTP/2 |
+| 序列化 | 二进制（Hessian/Protobuf） | JSON/XML |
+| 性能 | 高（紧凑、高效） | 中（文本解析开销） |
+| 易用性 | 需要接口定义 | 无需定义，浏览器直接访问 |
+| 耦合度 | 强耦合（需要 stub 代码） | 松耦合 |
+| 流量管理 | 需要网关 | 天然支持（Nginx等） |
+| 适用场景 | 内部微服务通信 | 对外 API、跨语言调用 |
+
+#### **代码对比**：
+
+**RPC 调用（Dubbo）**：
+```java
+// 服务提供者
+public interface UserService {
+    User getUserById(Long id);
+}
+
+// 服务消费者
+// 像调用本地方法一样调用远程服务
+@Reference
+private UserService userService;
+
+public void process() {
+    User user = userService.getUserById(1L);
+}
+```
+
+**HTTP REST 调用**：
+```java
+// 服务提供者
+@RestController
+@RequestMapping("/api/users")
+public class UserController {
+    @GetMapping("/{id}")
+    public User getUserById(@PathVariable Long id) {
+        return userService.getById(id);
+    }
+}
+
+// 服务消费者
+RestTemplate restTemplate = new RestTemplate();
+public void process() {
+    String url = "http://user-service/api/users/1";
+    User user = restTemplate.getForObject(url, User.class);
+}
+```
+
+---
+
+### 2. Dubbo 核心架构
+
+#### **架构图**：
+```
+                    ┌─────────────────┐
+                    │   Registry      │
+                    │  (注册中心)      │
+                    │  Zookeeper/Nacos│
+                    └─────────────────┘
+                           ▲   ▲
+                           │   │
+          Register         │   │         Subscribe
+          (注册)            │   │          (订阅)
+                           │   │
+    ┌──────────────────────┴───┴──────────────────────┐
+    │                                              │
+    │  Provider                        Consumer     │
+    │  ┌──────────┐                    ┌──────────┐│
+    │  │Protocol  │                    │Protocol  ││
+    │  │  (协议层) │                    │  (协议层) ││
+    │  └──────────┘                    └──────────┘│
+    │  ┌──────────┐                    ┌──────────┐│
+    │  │  Cluster │◄──────────────────►│  Cluster ││
+    │  │  (集群层) │    Directory      │  (集群层) ││
+    │  └──────────┘                    └──────────┘│
+    │  ┌──────────┐                    ┌──────────┐│
+    │  │   Proxy  │                    │   Proxy  ││
+    │  │ (代理层)  │                    │ (代理层) ││
+    │  └──────────┘                    └──────────┘│
+    │  ┌──────────┐                    ┌──────────┐│
+    │  │  Service │                    │  Service ││
+    │  │  (服务层) │                    │  (服务层) ││
+    │  └──────────┘                    └──────────┘│
+    └─────────────────────────────────────────────┘
+                │
+                │ Invoke
+                │ (调用)
+                ▼
+         ┌──────────┐
+         │  Channel │
+         │ (网络层)  │
+         └──────────┘
+         │
+         │ Exchange
+         │ (数据交换)
+         ▼
+         ┌──────────┐
+         │  Serialize│
+         │  (序列化)  │
+         └──────────┘
+```
+
+#### **核心角色**：
+
+**1. Container（服务容器）**
+- 负责启动、加载和运行服务提供者
+- 通常是 Spring 容器
+
+**2. Provider（服务提供者）**
+- 暴露服务的应用
+- 启动时向注册中心注册服务
+
+**3. Consumer（服务消费者）**
+- 调用远程服务的应用
+- 启动时向注册中心订阅服务
+
+**4. Registry（注册中心）**
+- 服务注册与发现
+- 常见实现：Zookeeper、Nacos、Redis
+
+**5. Monitor（监控中心）**
+- 统计服务调用次数和调用时间
+- 常见实现：Dubbo Admin、Prometheus
+
+#### **代码示例**：
+
+**服务提供者配置**：
+```xml
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- provider.xml -->
+<beans xmlns="http://www.springframework.org/schema/beans"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xmlns:dubbo="http://dubbo.apache.org/schema/dubbo"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans
+       http://www.springframework.org/schema/beans/spring-beans.xsd
+       http://dubbo.apache.org/schema/dubbo
+       http://dubbo.apache.org/schema/dubbo/dubbo.xsd">
+
+    <!-- 提供方应用信息 -->
+    <dubbo:application name="user-provider"/>
+
+    <!-- 使用 Zookeeper 注册中心 -->
+    <dubbo:registry address="zookeeper://127.0.0.1:2181"/>
+
+    <!-- 使用 dubbo 协议暴露服务 -->
+    <dubbo:protocol name="dubbo" port="20880"/>
+
+    <!-- 声明需要暴露的服务接口 -->
+    <dubbo:service interface="com.example.UserService"
+                   ref="userService" version="1.0.0"/>
+
+    <!-- 服务实现 -->
+    <bean id="userService" class="com.example.UserServiceImpl"/>
+</beans>
+```
+
+**服务消费者配置**：
+```xml
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- consumer.xml -->
+<beans xmlns="http://www.springframework.org/schema/beans"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xmlns:dubbo="http://dubbo.apache.org/schema/dubbo"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans
+       http://www.springframework.org/schema/beans/spring-beans.xsd
+       http://dubbo.apache.org/schema/dubbo
+       http://dubbo.apache.org/schema/dubbo/dubbo.xsd">
+
+    <!-- 消费方应用信息 -->
+    <dubbo:application name="user-consumer"/>
+
+    <!-- 使用 Zookeeper 注册中心 -->
+    <dubbo:registry address="zookeeper://127.0.0.1:2181"/>
+
+    <!-- 生成远程服务代理 -->
+    <dubbo:reference id="userService"
+                     interface="com.example.UserService"
+                     version="1.0.0"
+                     timeout="3000"
+                     retries="2"/>
+</beans>
+```
+
+---
+
+### 3. gRPC 高性能原理
+
+#### **核心特性**：
+
+**1. HTTP/2 多路复用**
+```
+HTTP/1.1:
+Request 1 ──► TCP Connection 1 ──► Response 1
+Request 2 ──► TCP Connection 2 ──► Response 2
+Request 3 ──► TCP Connection 3 ──► Response 3
+
+HTTP/2:
+Request 1 ──┐
+Request 2 ──┼─► TCP Connection ──► Response 1
+Request 3 ──┘                       Response 2
+                                     Response 3
+```
+
+**2. Protobuf 二进制序列化**
+```protobuf
+// user.proto
+syntax = "proto3";
+
+package user;
+
+service UserService {
+  rpc GetUser(GetUserRequest) returns (User);
+  rpc ListUsers(ListUsersRequest) returns (stream User);
+}
+
+message User {
+  int64 id = 1;
+  string name = 2;
+  string email = 3;
+}
+
+message GetUserRequest {
+  int64 id = 1;
+}
+
+message ListUsersRequest {
+  int32 page = 1;
+  int32 size = 2;
+}
+
+message ListUsersResponse {
+  repeated User users = 1;
+  int32 total = 2;
+}
+```
+
+**性能对比**：
+```
+JSON: {"id":1,"name":"Alice","email":"alice@example.com"}
+     └─ 51 字节
+
+Protobuf: [0x08 0x01 0x12 0x05 0x41 0x6C 0x69 0x63 0x65 ...]
+        └─ 28 字节（体积减少约 45%）
+```
+
+**3. 流式传输**
+```python
+# 服务端流式 RPC
+async def ListUsers(request, context):
+    for user in database.iter_users():
+        yield user  # 持续发送，无需等待全部数据
+
+# 客户端流式 RPC
+async def UploadUsers(request_iterator, context):
+    async for user_request in request_iterator:
+        database.save(user_request.user)
+    return UploadStatus(success=True)
+
+# 双向流式 RPC
+async def Chat(request_iterator, context):
+    async for msg in request_iterator:
+        response = process_message(msg)
+        yield response
+```
+
+#### **代码示例（Python）**：
+
+**服务端**：
+```python
+import grpc
+from concurrent import futures
+import user_pb2
+import user_pb2_grpc
+
+class UserServiceImpl(user_pb2_grpc.UserServiceServicer):
+    def GetUser(self, request, context):
+        # 查询数据库
+        user = db.query(User).filter_by(id=request.id).first()
+        return user_pb2.User(
+            id=user.id,
+            name=user.name,
+            email=user.email
+        )
+
+    def ListUsers(self, request, context):
+        # 服务端流式响应
+        users = db.query(User).limit(request.size).offset(request.page * request.size)
+        for user in users:
+            yield user_pb2.User(id=user.id, name=user.name, email=user.email)
+
+def serve():
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+    user_pb2_grpc.add_UserServiceServicer_to_server(UserServiceImpl(), server)
+    server.add_insecure_port('[::]:50051')
+    server.start()
+    server.wait_for_termination()
+
+if __name__ == '__main__':
+    serve()
+```
+
+**客户端**：
+```python
+import grpc
+import user_pb2
+import user_pb2_grpc
+
+def run():
+    with grpc.insecure_channel('localhost:50051') as channel:
+        stub = user_pb2_grpc.UserServiceStub(channel)
+
+        # 简单 RPC
+        response = stub.GetUser(user_pb2.GetUserRequest(id=1))
+        print(f"User: {response.name}")
+
+        # 服务端流式 RPC
+        for user in stub.ListUsers(user_pb2.ListUsersRequest(page=0, size=10)):
+            print(f"User: {user.name}")
+
+if __name__ == '__main__':
+    run()
+```
+
+---
+
+### 4. Dubbo 负载均衡策略
+
+#### **策略对比**：
+
+| 策略 | 说明 | 适用场景 |
+|------|------|----------|
+| Random（随机） | 随机选择 provider | 性能相近的实例 |
+| RoundRobin（轮询） | 按权重轮询 | 性能有差异的实例 |
+| LeastActive（最少活跃） | 优先调用活跃数少的 | 性能差异大 |
+| ConsistentHash（一致性哈希） | 相同参数路由到同一 provider | 有状态服务 |
+| ShortestResponse（最短响应） | 优先选择响应时间短的 | 对延迟敏感 |
+
+#### **代码示例**：
+
+**配置负载均衡**：
+```xml
+<!-- loadbalance 取值为全小写的扩展名：roundrobin 表示轮询 -->
+<dubbo:reference id="userService" interface="com.example.UserService"
+                 loadbalance="roundrobin"
+                 timeout="3000"/>
+```
+
+**自定义负载均衡**：
+```java
+public class CustomLoadBalance extends AbstractLoadBalance {
+    @Override
+    protected <T> Invoker<T> doSelect(List<Invoker<T>> invokers, URL url, Invocation invocation) {
+        // 自定义负载均衡逻辑
+        // 例如：基于地理位置的负载均衡
+        String location = getUserLocation();
+        return invokers.stream()
+            .filter(invoker -> invoker.getUrl().getParameter("location").equals(location))
+            .findFirst()
+            .orElse(invokers.get(0));
+    }
+}
+
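+// 注册自定义负载均衡（Dubbo SPI）：在 META-INF/dubbo/org.apache.dubbo.rpc.cluster.LoadBalance 文件中添加
+// custom=com.example.CustomLoadBalance，然后在引用处配置 loadbalance="custom"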
+```
+
+#### **LeastActive 原理**：
+```
+Provider A: Active = 5 (正在处理 5 个请求)
+Provider B: Active = 2 (正在处理 2 个请求)
+Provider C: Active = 8 (正在处理 8 个请求)
+
+选择顺序：B > A > C
+原因：B 的负载最轻，应该优先分配
+```
+
+---
+
+### 5. 服务注册与发现
+
+#### **Zookeeper 实现**：
+
+**目录结构**：
+```
+/dubbo
+  └─ com.example.UserService
+      ├─ providers
+      │   ├─ dubbo://192.168.1.10:20880/...?version=1.0.0
+      │   ├─ dubbo://192.168.1.11:20880/...?version=1.0.0
+      │   └─ dubbo://192.168.1.12:20880/...?version=1.0.0
+      └─ consumers
+          └─ consumer://192.168.1.20/...?version=1.0.0
+```
+
+**工作流程**：
+```
+1. Provider 启动
+   ↓
+2. 创建临时节点 /dubbo/.../providers/dubbo://ip:port/...
+   ↓
+3. Consumer 启动
+   ↓
+4. 订阅 /dubbo/.../providers/ 节点
+   ↓
+5. 获取 provider 列表
+   ↓
+6. 监听 provider 变化（新增/下线）
+   ↓
+7. 动态更新本地缓存
+```
+
+#### **代码示例（Zookeeper）**：
+```java
+// 注册中心配置
+RegistryConfig registry = new RegistryConfig();
+registry.setAddress("zookeeper://127.0.0.1:2181");
+registry.setTimeout(5000);
+
+// 或者使用 Nacos（二选一）
+RegistryConfig nacosRegistry = new RegistryConfig();
+nacosRegistry.setAddress("nacos://127.0.0.1:8848");
+```
+
+#### **服务健康检查**：
+```java
+// Dubbo 心跳机制
+public class HeartbeatTask implements Runnable {
+    @Override
+    public void run() {
+        // 每隔 5 秒发送心跳
+        channel.sendHeartbeat();
+    }
+}
+
+// Zookeeper 临时节点特性
+// - Provider 断开连接后，临时节点自动删除
+// - Consumer 立即感知到下线，剔除该 provider
+```
+
+---
+
+### 6. 序列化协议对比
+
+#### **常见序列化协议**：
+
+| 协议 | 优点 | 缺点 | 适用场景 |
+|------|------|------|----------|
+| Hessian | 简单、高效 | 不支持跨语言 | Dubbo 默认 |
+| Protobuf | 高性能、跨语言 | 需要定义 .proto | gRPC |
+| JSON | 易读、跨语言 | 冗长、解析慢 | HTTP REST |
+| Kryo | 高性能 | 不支持跨语言 | Dubbo |
+| Avro | 动态 schema、跨语言 | 性能略低 | Hadoop 生态 |
+| FST | 高性能、兼容 JDK | 不支持跨语言 | Dubbo |
+
+#### **性能对比**：
+```
+序列化性能排名（从快到慢）：
+Kryo > FST > Protobuf > Hessian > Avro > JSON
+
+序列化后大小排名（从小到大）：
+Protobuf ≈ Kryo < Hessian < Avro < JSON
+```
+
+#### **代码示例（Protobuf）**：
+```protobuf
+// user.proto
+syntax = "proto3";
+
+message User {
+    int64 id = 1;
+    string name = 2;
+    string email = 3;
+    repeated string tags = 4;
+}
+```
+
+```bash
+# 编译 Protobuf
+protoc --python_out=. user.proto
+```
+
+```python
+# Python 序列化
+import user_pb2
+
+user = user_pb2.User()
+user.id = 1
+user.name = "Alice"
+user.email = "alice@example.com"
+user.tags.extend(["vip", "active"])
+
+# 序列化
+serialized = user.SerializeToString()  # 二进制数据
+
+# 反序列化
+user2 = user_pb2.User()
+user2.ParseFromString(serialized)
+```
+
+---
+
+### 7. RPC 框架选型
+
+#### **选型决策树**：
+```
+是否需要跨语言调用？
+├─ 是 → gRPC（Protobuf 跨语言支持最好）
+└─ 否 → 继续判断
+
+是否需要高性能？
+├─ 是 → Dubbo（TCP 长连接、Hessian 序列化）
+└─ 否 → 继续判断
+
+是否需要简单易用？
+├─ 是 → Spring Cloud OpenFeign（基于 HTTP REST）
+└─ 否 → Dubbo
+
+已有技术栈？
+├─ Spring Cloud → OpenFeign/Dubbo
+├─ Kubernetes → gRPC（服务网格友好）
+└─ Dubbo → 继续使用 Dubbo
+```
+
+#### **实际项目经验**：
+
+**场景 1：电商内部服务**
+```
+选择：Dubbo
+原因：
+- 内部服务，都是 Java 技术栈
+- 对性能要求高（高并发下单）
+- 需要负载均衡、熔断降级
+
+配置：
+- 使用 Hessian 序列化
+- Zookeeper 注册中心
+- LeastActive 负载均衡
+```
+
+**场景 2：跨语言微服务**
+```
+选择：gRPC
+原因：
+- 后端 Java，数据分析 Python，AI 服务 Go
+- 需要统一的服务间通信协议
+- Protobuf 高性能且跨语言
+
+配置：
+- Protobuf 定义接口
+- HTTP/2 传输
+- 多语言代码生成
+```
+
+---
+
+### 8. 超时、重试和熔断
+
+#### **超时配置**：
+```xml
+<!-- Dubbo 超时 -->
+<dubbo:reference id="userService"
+                 interface="com.example.UserService"
+                 timeout="3000"/>  <!-- 3 秒超时 -->
+
+<!-- 方法级超时 -->
+<dubbo:reference id="userService"
+                 interface="com.example.UserService">
+    <dubbo:method name="getUserById" timeout="1000"/>
+    <dubbo:method name="listUsers" timeout="5000"/>
+</dubbo:reference>
+```
+
+#### **重试机制**：
+```xml
+<dubbo:reference id="userService"
+                 interface="com.example.UserService"
+                 retries="2"/>  <!-- 失败后重试 2 次 -->
+
+<!-- 工作流程 -->
+第一次调用 → 失败
+    ↓
+第二次调用 → 失败
+    ↓
+第三次调用 → 成功/失败
+```
+
+**注意**：幂等性操作才能重试（如查询），非幂等操作（如下单）不能重试
+
+```xml
+<!-- 非幂等操作禁用重试 -->
+<dubbo:method name="createOrder" retries="0"/>
+```
+
+#### **熔断降级（Dubbo）**：
+```java
+// 使用 Sentinel 实现熔断
+@SentinelResource(value = "getUserById",
+    blockHandler = "handleBlock",
+    fallback = "handleFallback")
+public User getUserById(Long id) {
+    return userService.getUserById(id);
+}
+
+// 熔断处理
+public User handleBlock(Long id, BlockException ex) {
+    // 熔断时返回默认值
+    return new User(-1L, "Default", "default@example.com");
+}
+
+// 降级处理
+public User handleFallback(Long id, Throwable ex) {
+    // 异常时返回降级数据
+    return new User(-1L, "Fallback", "fallback@example.com");
+}
+```
+
+**熔断规则配置**：
+```java
+// Sentinel 熔断规则
+List<DegradeRule> rules = new ArrayList<>();
+DegradeRule rule = new DegradeRule();
+rule.setResource("getUserById");
+rule.setGrade(RuleConstant.DEGRADE_GRADE_RT);  // 平均响应时间
+rule.setCount(100);  // 100ms
+rule.setTimeWindow(10);  // 10 秒熔断时间
+rules.add(rule);
+DegradeRuleManager.loadRules(rules);
+```
+
+---
+
+### 9. 实际项目经验
+
+#### **场景 1：订单系统性能优化**
+```
+问题：订单创建接口延迟高（2 秒）
+排查：
+1. 调用链追踪发现库存服务耗时最长
+2. 库存服务使用 HTTP REST，JSON 序列化慢
+3. 每次调用都建立新连接
+
+解决：
+1. 将库存服务从 HTTP REST 迁移到 Dubbo
+2. 使用 Hessian 序列化
+3. 启用长连接复用
+4. 配置 LeastActive 负载均衡
+
+结果：延迟降低到 300ms（提升 85%）
+```
+
+#### **场景 2：服务注册中心故障**
+```
+问题：Zookeeper 集群故障，服务调用失败
+排查：
+Consumer 没有可用的本地 provider 缓存（例如重启后需重新从注册中心拉取地址列表），导致无法发现服务
+
+解决：
+1. Dubbo 默认会缓存 provider 列表到本地
+2. 配置缓存策略
+   <dubbo:registry address="zookeeper://127.0.0.1:2181"
+                  file="${user.home}/output/dubbo.cache"/>
+3. 注册中心故障时，使用本地缓存
+
+结果：注册中心故障不影响已有服务调用
+```
+
+#### **场景 3：序列化兼容性问题**
+```
+问题：升级服务版本后，旧客户端调用失败
+原因：
+- 新增字段使用了不可序列化的类型
+- 客户端版本不兼容
+
+解决：
+1. Protobuf 默认兼容（新增字段不影响）
+2. Hessian 需要保证序列化 ID 一致
+3. 使用版本号区分服务
+   <dubbo:service interface="..." version="1.0.0"/>
+   <dubbo:service interface="..." version="2.0.0"/>
+4. 灰度升级，逐步切换流量
+
+结果：平滑升级，零停机
+```
+
+---
+
+### 10. 阿里 P7 加分项
+
+**架构设计能力**：
+- 设计过大规模 RPC 框架的集群架构（百万级 QPS）
+- 有自定义 RPC 框架开发经验
+- 实现过服务网格与传统 RPC 框架的融合
+
+**深度理解**：
+- 熟悉 Dubbo 源码（SPI 机制、代理设计、集群容错）
+- 理解 gRPC 的 HTTP/2 和 Protobuf 底层原理
+- 有序列化协议的选型和优化经验
+
+**性能调优**：
+- 优化过 TCP 参数（连接池、KeepAlive、缓冲区大小）
+- 调整过 JVM 参数减少 GC（减少对象创建、使用堆外内存）
+- 优化过网络参数（MTU、TCP_NODELAY）
+
+**生产实践**：
+- 解决过 TCP 粘包/拆包问题
+- 处理过序列化安全漏洞（如 Hessian 反序列化 RCE）
+- 实现过服务优雅上下线（注册预热、优雅停机）
+
+**可观测性**：
+- 集成过分布式追踪（SkyWalking、Jaeger）
+- 实现过 RPC 调用链路监控
+- 设计过服务性能指标大盘（QPS、延迟、成功率）
+
+**跨语言调用**：
+- 有 gRPC 多语言实现经验（Java、Go、Python）
+- 解决过 Protobuf 跨语言兼容性问题
+- 实现过动态代理生成（如 Python 调用 Java 服务）
diff --git a/questions/service-mesh.md b/questions/service-mesh.md
new file mode 100644
index 0000000..b86008b
--- /dev/null
+++ b/questions/service-mesh.md
@@ -0,0 +1,605 @@
+# 服务网格 (Service Mesh)
+
+## 问题
+
+**背景**：在微服务架构中，随着服务数量增加，服务间的通信管理变得复杂。服务网格作为基础设施层，负责处理服务间通信的流量管理、安全性和可观测性。
+
+**问题**：
+1. 什么是服务网格？它解决了哪些问题？
+2. Istio 的核心组件有哪些？它们是如何协作的？
+3. Sidecar 模式的优缺点是什么？
+4. 请描述 Istio 的流量管理功能（灰度发布、蓝绿部署、熔断降级）
+5. Istio 如何实现 mTLS（双向 TLS）？
+6. 在生产环境中使用服务网格遇到过哪些坑？
+7. Linkerd 和 Istio 的区别是什么？如何选择？
+
+---
+
+## 标准答案
+
+### 1. 服务网格概述
+
+**定义**：
+服务网格是微服务架构中用于处理服务间通信的基础设施层，通常以轻量级网络代理的形式实现。
+
+**核心功能**：
+- **流量管理**：路由规则、负载均衡、灰度发布
+- **安全性**：mTLS、JWT 验证、访问控制
+- **可观测性**：Metrics、Tracing、Logging
+
+**解决的问题**：
+```
+传统微服务架构的痛点：
+├─ 服务间通信逻辑散落在每个服务中
+├─ 熔断、重试、超时等逻辑重复实现
+├─ 安全策略难以统一管理
+├─ 可观测性数据收集困难
+└─ 灰度发布、流量染色需要大量代码
+
+服务网格的解决方案：
+├─ 将通信逻辑下沉到 Sidecar 代理
+├─ 控制平面统一配置管理
+├─ 数据平面透明代理流量
+├─ 自动收集可观测性数据
+└─ 声明式 API 管理流量
+```
+
+---
+
+### 2. Istio 核心组件
+
+#### **架构图**：
+```
+                    ┌─────────────────┐
+                    │   Control Plane │
+                    └─────────────────┘
+                           │
+         ┌─────────────────┼─────────────────┐
+         │                 │                 │
+    ┌─────────┐      ┌──────────┐      ┌──────────┐
+    │ Istiod  │      │Pilot     │      │Citadel   │
+    │ (统一)  │      │(流量管理) │      │(证书管理)│
+    └─────────┘      └──────────┘      └──────────┘
+         │
+         │ 配置下发
+         │
+    ┌─────────────────────────────────────┐
+    │           Data Plane                │
+    ├─────────────────────────────────────┤
+    │                                     │
+    │  Service A          Service B       │
+    │  ┌─────────┐      ┌─────────┐      │
+    │  │ Envoy   │◄────►│ Envoy   │      │
+    │  │ Sidecar │      │ Sidecar │      │
+    │  └─────────┘      └─────────┘      │
+    │                                     │
+    └─────────────────────────────────────┘
+```
+
+#### **核心组件详解**：
+
+**1. Istiod（统一控制平面）**
+- **Pilot**：流量管理和配置下发
+- **Citadel**：证书管理和身份认证
+- **Galley**：配置验证和注入（Istio 1.5+ 已合并到 Istiod）
+
+**代码示例 - Istiod 配置**：
+```yaml
+# istiod deployment
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: istiod
+  namespace: istio-system
+spec:
+  template:
+    spec:
+      containers:
+      - name: discovery
+        image: docker.io/istio/pilot:1.19.0
+        args:
+          - "discovery"
+          - "--monitoringAddr=:15014"
+          - "--log_output_level=default:info"
+        ports:
+        - containerPort: 15012 # Pilot 服务的 xDS 端口
+          name: grpc-xds
+```
+
+**2. Envoy Sidecar（数据平面）**
+- 拦截所有进出流量
+- 执行流量规则（路由、负载均衡）
+- 收集 Metrics 和 Traces
+- 处理 mTLS 加解密
+
+**Sidecar 注入示例**：
+```yaml
+# 自动注入 Sidecar
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: my-app
+spec:
+  template:
+    metadata:
+      labels:
+        sidecar.istio.io/inject: "true"  # 启用自动注入（必须设置在 Pod 模板上，Deployment 自身的元数据不生效）
+      annotations:
+        sidecar.istio.io/rewriteAppHTTPProbers: "true"  # 重写 HTTP probes
+    spec:
+      containers:
+      - name: app
+        image: my-app:1.0.0
+        ports:
+        - containerPort: 8080
+```
+
+---
+
+### 3. Sidecar 模式
+
+#### **优点**：
+1. **透明性**：业务代码无感知，无需修改
+2. **语言无关**：任何语言都能使用
+3. **统一管理**：集中配置，易于维护
+4. **渐进式采用**：可以逐步迁移
+
+#### **缺点**：
+1. **资源开销**：每个服务都有 Sidecar，增加内存和 CPU
+   ```
+   典型资源占用：
+   - 内存：50-100MB per Sidecar
+   - CPU：5-10% per core
+   - 延迟增加：1-5ms
+   ```
+
+2. **网络链路增加**：
+   ```
+   请求路径（有 Sidecar）：
+   Client → Sidecar A → Service A → Sidecar B → Service B
+
+   请求路径（无 Sidecar）：
+   Client → Service A → Service B
+   ```
+
+3. **调试复杂度**：多了一层网络代理
+
+#### **优化方案**：
+```yaml
+# Sidecar 资源限制
+apiVersion: v1
+kind: Pod
+metadata:
+  name: my-app
+spec:
+  containers:
+  - name: istio-proxy
+    resources:
+      requests:
+        cpu: 50m
+        memory: 64Mi
+      limits:
+        cpu: 500m
+        memory: 256Mi
+```
+
+**Sidecar 资源配置模式**：
+```yaml
+# 全局 Sidecar 默认资源配置（通过 IstioOperator / Helm values 的 global.proxy.resources 设置）
+apiVersion: install.istio.io/v1alpha1
+kind: IstioOperator
+spec:
+  values:
+    global:
+      proxy:
+        resources:
+          requests:
+            cpu: 50m
+            memory: 64Mi
+          limits:
+            cpu: 500m
+            memory: 256Mi
+```
+
+---
+
+### 4. 流量管理
+
+#### **4.1 灰度发布 (Canary Deployment)**
+
+**场景**：新版本 v2 发布给 10% 的流量
+
+```yaml
+apiVersion: networking.istio.io/v1beta1
+kind: VirtualService
+metadata:
+  name: reviews
+spec:
+  hosts:
+  - reviews
+  http:
+  - match:
+    - headers:
+        x-canary:
+          exact: "true"  # 带特定 header 的流量走 v2
+    route:
+    - destination:
+        host: reviews
+        subset: v2
+  - route:
+    - destination:
+        host: reviews
+        subset: v1
+      weight: 90  # 90% 流量走 v1
+    - destination:
+        host: reviews
+        subset: v2
+      weight: 10  # 10% 流量走 v2
+---
+apiVersion: networking.istio.io/v1beta1
+kind: DestinationRule
+metadata:
+  name: reviews
+spec:
+  host: reviews
+  subsets:
+  - name: v1
+    labels:
+      version: v1
+  - name: v2
+    labels:
+      version: v2
+```
+
+#### **4.2 蓝绿部署 (Blue-Green Deployment)**
+
+**场景**：一键切换全部流量到新版本
+
+```yaml
+apiVersion: networking.istio.io/v1beta1
+kind: VirtualService
+metadata:
+  name: my-app
+spec:
+  hosts:
+  - my-app
+  http:
+  - route:
+    - destination:
+        host: my-app
+        subset: blue  # 所有流量指向 blue
+      weight: 100
+    # 切换到 green：修改 subset 为 green
+---
+# Kubernetes Deployment：同时存在 blue 和 green
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: my-app-blue
+spec:
+  template:
+    metadata:
+      labels:
+        version: blue
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: my-app-green
+spec:
+  template:
+    metadata:
+      labels:
+        version: green
+```
+
+#### **4.3 熔断降级 (Circuit Breaker)**
+
+**场景**：防止故障扩散
+
+```yaml
+apiVersion: networking.istio.io/v1beta1
+kind: DestinationRule
+metadata:
+  name: httpbin
+spec:
+  host: httpbin
+  trafficPolicy:
+    connectionPool:
+      tcp:
+        maxConnections: 10  # 最大连接数
+      http:
+        http1MaxPendingRequests: 50  # 最大等待请求数
+        http2MaxRequests: 100  # 最大并发请求数
+        maxRequestsPerConnection: 2  # 每连接最大请求数
+        maxRetries: 3  # 同一时刻允许对该服务所有实例进行的最大并发重试数
+    outlierDetection:
+      consecutive5xxErrors: 5  # 连续 5 次 5xx 错误（consecutiveErrors 已废弃）
+      interval: 30s  # 每 30s 检查一次
+      baseEjectionTime: 30s  # 最小熔断时间
+      maxEjectionPercent: 50  # 最多熔断 50% 的实例
+      minHealthPercent: 40  # 最小健康实例比例
+```
+
+**熔断状态图**：
+```
+Closed → Open → Half-Open → Closed
+   ↑        │          │
+   └────────┴──────────┘
+```
+
+#### **4.4 超时和重试**
+
+```yaml
+apiVersion: networking.istio.io/v1beta1
+kind: VirtualService
+metadata:
+  name: reviews
+spec:
+  hosts:
+  - reviews
+  http:
+  - retries:
+      attempts: 3  # 最多重试 3 次
+      perTryTimeout: 2s  # 每次重试超时 2s
+      retryOn: 5xx,connect-failure,refused-stream  # 重试条件
+    timeout: 10s  # 总超时时间
+    route:
+    - destination:
+        host: reviews
+```
+
+---
+
+### 5. mTLS (双向 TLS) 实现
+
+#### **原理**：
+```
+Service A                              Service B
+    │                                      │
+    │ 1. 发送连接请求（无证书）              │
+    │ ─────────────────────────────────────►│
+    │                                      │
+    │ 2. 返回服务器证书                      │
+    │ ◄─────────────────────────────────────│
+    │                                      │
+    │ 3. 发送客户端证书                      │
+    │ ─────────────────────────────────────►│
+    │                                      │
+    │ 4. 验证通过，建立加密连接               │
+    │ ◄────────────────────────────────────►│
+    │                                      │
+    │ 5. 加密通信                           │
+    │ ◄──────────────►                     │
+```
+
+#### **配置示例**：
+
+**全局启用 mTLS**：
+```yaml
+apiVersion: security.istio.io/v1beta1
+kind: PeerAuthentication
+metadata:
+  name: default
+  namespace: istio-system
+spec:
+  mtls:
+    mode: STRICT  # 严格模式：必须使用 mTLS
+```
+
+**按服务配置**：
+```yaml
+apiVersion: security.istio.io/v1beta1
+kind: PeerAuthentication
+metadata:
+  name: my-app-mtls
+  namespace: default
+spec:
+  selector:
+    matchLabels:
+      app: my-app
+  mtls:
+    mode: PERMISSIVE  # 宽松模式：兼容 mTLS 和明文
+```
+
+**服务授权**：
+```yaml
+apiVersion: security.istio.io/v1beta1
+kind: AuthorizationPolicy
+metadata:
+  name: my-app-authz
+spec:
+  selector:
+    matchLabels:
+      app: my-app
+  action: ALLOW
+  rules:
+  - from:
+    - source:
+        principals: ["cluster.local/ns/default/sa/frontend"]  # 只允许 frontend SA 访问
+    to:
+    - operation:
+        methods: ["GET", "POST"]
+```
+
+#### **证书管理流程**：
+```
+1. Istiod（Citadel）为工作负载签发证书
+   ↓
+2. 证书通过 SDS 下发给 Sidecar（保存在内存中，不再落盘为 Secret）
+   ↓
+3. Envoy Sidecar 启动时加载证书
+   ↓
+4. 定期轮换证书（默认 24 小时）
+   ↓
+5. 旧证书过期，使用新证书
+```
+
+---
+
+### 6. 生产环境踩坑经验
+
+#### **坑 1：Sidecar 资源占用过高**
+```yaml
+# 问题：100 个服务 × 100MB = 10GB 内存
+# 解决：按需启用 Sidecar
+apiVersion: v1
+kind: Pod
+metadata:
+  name: my-app
+  annotations:
+    sidecar.istio.io/inject: "false"  # 禁用 Sidecar
+```
+
+#### **坑 2：网络延迟增加**
+```
+问题：请求延迟从 5ms 增加到 10ms
+原因：
+- Sidecar 增加了一跳
+- mTLS 加解密开销
+
+解决：
+1. 调整 Envoy 配置，减少日志级别
+2. 使用 PERMISSIVE 模式降级
+3. 增加超时时间配置
+```
+
+#### **坑 3：配置下发延迟**
+```yaml
+# 问题：修改 VirtualService 后，流量未立即切换
+# 原因：Pilot 下发配置有延迟（默认 1s）
+
+# 解决：减少配置刷新间隔
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: istio
+  namespace: istio-system
+data:
+  mesh: |-
+    defaultConfig:
+      proxyStatsMatcher:
+        inclusionRegexps:
+        - ".*"  # 收集所有指标
+      discoveryRefreshDelay: 1s  # 配置刷新延迟
+      # 注意：该字段在新版本中已废弃，推送延迟由 Istiod 的 PILOT_DEBOUNCE_AFTER / PILOT_DEBOUNCE_MAX 控制
+```
+
+#### **坑 4：大规模性能问题**
+```
+问题：集群 1000+ 服务时，Istiod 性能瓶颈
+解决：
+1. 部署多个 Istiod 实例
+2. 使用 Namespace 隔离配置
+3. 启用配置压缩
+```
+
+**多实例部署**：
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: istiod
+spec:
+  replicas: 3  # 多副本
+```
+
+---
+
+### 7. Linkerd vs Istio
+
+#### **对比表**：
+
+| 特性 | Linkerd 2.x | Istio |
+|------|-------------|-------|
+| 代理 | Rust 实现（Linkerd2-proxy） | Envoy（C++） |
+| 性能 | 更低资源占用 | 资源占用较高 |
+| 功能 | 聚焦核心功能 | 功能更丰富 |
+| 集成 | Kubernetes 原生 | 支持多平台 |
+| 学习曲线 | 简单 | 复杂 |
+| 社区 | CNCF 毕业项目 | CNCF 毕业项目（2023 年毕业） |
+| 多集群支持 | 弱 | 强 |
+| 流量管理 | 基础 | 高级（灰度、蓝绿等） |
+
+#### **选择建议**：
+
+**选择 Linkerd**：
+- Kubernetes 原生环境
+- 重视性能和资源占用
+- 需要简单易用的解决方案
+- 功能要求不高
+
+**选择 Istio**：
+- 需要高级流量管理
+- 多集群/多云环境
+- 需要细粒度的安全控制
+- 团队有运维能力
+
+---
+
+### 8. 实际项目经验
+
+#### **场景 1：电商系统灰度发布**
+```
+需求：新支付系统先给 5% 用户试用
+方案：
+1. 部署 payment-v2
+2. 配置 VirtualService，5% 流量走 v2
+3. 监控错误率和延迟
+4. 逐步增加流量比例：5% → 20% → 50% → 100%
+5. 出现问题立即回滚
+```
+
+#### **场景 2：金融系统 mTLS 合规**
+```
+需求：所有服务间通信必须加密
+方案：
+1. 启用 STRICT mTLS 模式
+2. 配置 AuthorizationPolicy，只允许合法的 SA 访问
+3. 定期轮换证书（24 小时）
+4. 审计日志记录所有通信
+```
+
+#### **场景 3：多集群容灾**
+```
+需求：主集群故障时自动切换到备用集群
+方案：
+1. 使用 Multi-Cluster Mesh
+2. 配置 ServiceEntry，指向备用集群
+3. 配置 DestinationRule，故障时自动切换
+4. 跨集群流量加密
+```
+
+---
+
+### 9. 阿里 P7 加分项
+
+**架构设计能力**：
+- 设计过大规模服务网格架构（500+ 服务）
+- 有多集群/多云服务网格实施经验
+- 实现过自定义 Control Plane（如基于 Istio API）
+
+**深度理解**：
+- 理解 Envoy 内部机制（过滤器链、连接池、HTTP/2）
+- 熟悉 xDS 协议（CDS, EDS, LDS, RDS）
+- 有性能调优经验（减少延迟、优化资源占用）
+
+**实际项目**：
+- 主导过从传统架构迁移到服务网格
+- 解决过生产环境的疑难问题（如网络分区、证书轮换故障）
+- 开发过自定义 WASM 插件扩展 Envoy 功能
+
+**开源贡献**：
+- 向 Istio/Envoy 社区提交过 PR
+- 解决过社区 Issue
+- 编写过相关技术博客或演讲
+
+**监控和可观测性**：
+- 设计过服务网格监控体系
+- 使用 Prometheus/Grafana 监控 Sidecar 性能
+- 实现过分布式追踪集成（Jaeger/Zipkin）
+
+**安全实践**：
+- 实现过零信任网络架构
+- 有安全审计和合规经验
+- 设计过细粒度的 RBAC 策略