More work on early system setup init containers

Garrett Mills 2025-04-22 10:03:57 -04:00
parent 1fa2fd3d95
commit 2f28459f9a
9 changed files with 167 additions and 17 deletions

Cargo.lock generated
View File

@@ -908,12 +908,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
version = "0.3.9"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
checksum = "976dd42dc7e85965fe702eb8164f21f450704bdde31faefd6471dba214cb594e"
dependencies = [
"libc",
"windows-sys 0.52.0",
"windows-sys 0.59.0",
]
[[package]]
@@ -1171,6 +1171,16 @@ dependencies = [
"zeroize",
]
[[package]]
name = "gethostname"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed7131e57abbde63513e0e6636f76668a1ca9798dcae2df4e283cae9ee83859e"
dependencies = [
"rustix 1.0.5",
"windows-targets 0.52.6",
]
[[package]]
name = "getrandom"
version = "0.2.15"
@@ -1850,9 +1860,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.161"
version = "0.2.172"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1"
checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
[[package]]
name = "libm"
@@ -1903,6 +1913,12 @@ version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
[[package]]
name = "linux-raw-sys"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12"
[[package]]
name = "lock_api"
version = "0.4.12"
@@ -2275,6 +2291,7 @@ dependencies = [
"dotenv",
"env_logger",
"futures",
"gethostname",
"k8s-openapi",
"kube",
"log",
@@ -2985,10 +3002,23 @@ dependencies = [
"bitflags 2.6.0",
"errno",
"libc",
"linux-raw-sys",
"linux-raw-sys 0.4.14",
"windows-sys 0.52.0",
]
[[package]]
name = "rustix"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d97817398dd4bb2e6da002002db259209759911da105da92bec29ccb12cf58bf"
dependencies = [
"bitflags 2.6.0",
"errno",
"libc",
"linux-raw-sys 0.9.4",
"windows-sys 0.59.0",
]
[[package]]
name = "rustls"
version = "0.23.25"
@@ -3944,7 +3974,7 @@ dependencies = [
"cfg-if",
"fastrand",
"once_cell",
"rustix",
"rustix 0.38.38",
"windows-sys 0.59.0",
]

View File

@@ -24,3 +24,4 @@ ssh-key = { version = "0.6.7", features = ["ed25519"] }
rand = "0.8.5"
kube = { version = "0.99.0", features = ["runtime", "derive"] }
k8s-openapi = { version = "0.24", features = ["v1_32"] }
gethostname = "1.0.1"

View File

@@ -0,0 +1,77 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: p5x-api-sa
  namespace: p5x-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: p5x-api-configmap-role
  namespace: p5x-system
rules:
  - apiGroups: [""]
    resources: ["configmaps"]
    resourceNames: ["dynamic-kv"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: p5x-api-configmap-binding
  namespace: p5x-system
subjects:
  - kind: ServiceAccount
    name: p5x-api-sa
    namespace: p5x-system
roleRef:
  kind: Role
  name: p5x-api-configmap-role
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: pod-self-read-role
  namespace: p5x-system
rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: pod-self-read-binding
  namespace: p5x-system
subjects:
  - kind: ServiceAccount
    name: p5x-api-sa
    namespace: p5x-system
roleRef:
  kind: Role
  name: pod-self-read-role
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: node-label-read-role
rules:
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: node-label-read-binding
subjects:
  - kind: ServiceAccount
    name: p5x-api-sa
    namespace: p5x-system
roleRef:
  kind: ClusterRole
  name: node-label-read-role
  apiGroup: rbac.authorization.k8s.io
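
Taken together, these manifests give the api-server pod just enough access to discover where it is running and to persist what it finds: the first Role scopes ConfigMap access down to dynamic-kv, the pod-self-read Role lets the pod fetch its own Pod object to learn which node it was scheduled on, and the ClusterRole exposes node labels (Nodes are cluster-scoped, so a namespaced Role cannot cover them). A minimal kube-rs sketch of that flow, using only resource names that appear in this commit; the actual api-server code paths may differ:

use k8s_openapi::api::core::v1::{ConfigMap, Node, Pod};
use kube::api::{Api, Patch, PatchParams};
use kube::Client;
use serde_json::json;

async fn locate_and_record(pod_name: &str) -> Result<(), kube::Error> {
    let client = Client::try_default().await?;

    // pod-self-read-role: fetch our own Pod to find the node it was scheduled on.
    let pods: Api<Pod> = Api::namespaced(client.clone(), "p5x-system");
    let node_name = pods.get(pod_name).await?
        .spec
        .and_then(|spec| spec.node_name)
        .expect("pod has not been scheduled yet");

    // node-label-read-role (ClusterRole): read that node's labels,
    // e.g. to work out which PVE host backs it.
    let nodes: Api<Node> = Api::all(client.clone());
    let labels = nodes.get(&node_name).await?.metadata.labels.unwrap_or_default();
    println!("node {node_name} labels: {labels:?}");

    // p5x-api-configmap-role: record the result in the dynamic-kv ConfigMap.
    // "api-node-name" is an illustrative key; the commit's code uses keys like "api-pve-id".
    let maps: Api<ConfigMap> = Api::namespaced(client, "p5x-system");
    let patch = json!({ "data": { "api-node-name": node_name } });
    maps.patch("dynamic-kv", &PatchParams::default(), &Patch::Merge(&patch)).await?;
    Ok(())
}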

View File

@@ -19,11 +19,18 @@ spec:
        app: p5x-api-server
    spec:
      priorityClassName: system-cluster-critical
      # api-server interacts w/ some parts of the K8s API, so bind its service account
      serviceAccountName: p5x-api-sa
      volumes:
        # Used for the api-server SQLite database et al
        - name: p5x-system-data
          persistentVolumeClaim:
            claimName: system-data
      initContainers:
        # Make sure the p5x-system-data disk exists in PVE and is mounted
        - name: ensure-system-disk
          image: docker.io/glmdev/p5x-api:latest
          envFrom:
@@ -37,6 +44,22 @@ spec:
            - name: RUST_LOG
              value: 'p5x=debug,sqlx=warn,info'
          command: ['/p5x/p5x', 'ensure-system-disk']
        # Transfer the p5x-system-data PVE disk to the correct node, if necessary
        - name: migrate-system-disk
          image: docker.io/glmdev/p5x-api:latest
          envFrom:
            - secretRef:
                name: api-env
          env:
            - name: P5X_NODE_HOSTNAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: RUST_LOG
              value: 'p5x=debug,sqlx=warn,info'
          command: ['/p5x/p5x', 'migrate-system-disk']
      containers:
        - name: api-server
          image: docker.io/glmdev/p5x-api:latest
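
Because these run as init containers, Kubernetes starts them one at a time and requires each to exit successfully before the next begins: ensure-system-disk always completes before migrate-system-disk, and both complete before the api-server container starts. If a step fails, the pod stays in an Init:... status and that step's output can be inspected with kubectl logs -n p5x-system <pod> -c ensure-system-disk (or -c migrate-system-disk).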

View File

@@ -15,6 +15,7 @@ use proxmox_api::types::VmId;
use proxmox_api::UreqError;
use serde_json::json;
use tokio::time::sleep;
use gethostname::gethostname;
use crate::api::cluster::carrier::{provision_carrier_unmanaged, terminate_carrier_unmanaged};
use crate::api::cluster::node::migrate_node_unmanaged;
use crate::api::cluster::volume::create_volume_unmanaged;
@@ -46,7 +47,9 @@ pub async fn ensure_system_disk(svc: &Services<'_>) -> Result<(), P5xError> {
info!(target: "p5x", "Provisioning new P5x API system disk (this is a one-time fixup)...");
// Load the labels for this pod's node
let pod_name = env::var("POD_NAME").expect("Could not determine POD_NAME from environment!");
let pod_name = env::var("POD_NAME")
.or_else(|_| gethostname().into_string())
.expect("Could not determine pod name from environment");
let pods: Api<Pod> = Api::namespaced(client.clone(), &namespace);
let pod = pods.get(&pod_name).await.map_err(P5xError::KubeError)?;
@@ -72,7 +75,7 @@ pub async fn ensure_system_disk(svc: &Services<'_>) -> Result<(), P5xError> {
5 * 1024 * 1024 * 1024,
pve_host,
pve_id,
"p5x-api-system-disk"
"system-data"
).await?;
// Add it to the dynamic-kv config and save
@@ -96,6 +99,7 @@ pub async fn migrate_system_disk_if_necessary(svc: &Services<'_>) -> Result<(),
// Load the dynamic-kv and get the current host/mount
let client = Client::try_default().await.map_err(P5xError::KubeError)?;
let namespace = fs::read_to_string("/var/run/secrets/kubernetes.io/serviceaccount/namespace")
.or_else(|_| env::var("P5X_OVERRIDE_NS"))
.unwrap_or_else(|_| "p5x-system".to_string());
let maps: Api<ConfigMap> = Api::namespaced(client.clone(), &namespace);
@@ -107,7 +111,9 @@ let current_pve_id: i32 = data.get("api-pve-id").expect("Could not find api-pve-id in dynamic-kv config").parse().unwrap();
let current_pve_id: i32 = data.get("api-pve-id").expect("Could not find api-pve-id in dynamic-kv config").parse().unwrap();
// Load the labels for this pod's node
let pod_name = env::var("POD_NAME").expect("Could not determine POD_NAME from environment!");
let pod_name = env::var("POD_NAME")
.or_else(|_| gethostname().into_string())
.expect("Could not determine pod name from environment");
let pods: Api<Pod> = Api::namespaced(client.clone(), &namespace);
let pod = pods.get(&pod_name).await.map_err(P5xError::KubeError)?;
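
The gethostname() fallback works because Kubernetes sets a container's hostname to the pod name by default (unless spec.hostname overrides it), so the binary can still locate its own Pod object when no POD_NAME variable is injected via the downward API; the init containers added above set P5X_NODE_HOSTNAME but, as shown, no POD_NAME. The P5X_OVERRIDE_NS fallback on the namespace lookup likewise covers runs where the service-account files are not mounted.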

View File

@@ -1,6 +1,7 @@
use async_trait::async_trait;
use rocket::{fairing, Build, Rocket};
use rocket::fairing::AdHoc;
use sea_orm::DatabaseConnection;
pub use sea_orm_migration::prelude::*;
use sea_orm_rocket::Database;
use crate::api::Db;
@@ -30,6 +31,10 @@ async fn run_migrations(rocket: Rocket<Build>) -> fairing::Result {
Ok(rocket)
}
pub async fn run_migrations_unmanaged(conn: &DatabaseConnection) -> Result<(), DbErr> {
Migrator::up(conn, None).await
}
pub(super) fn init() -> AdHoc {
AdHoc::try_on_ignite("Applying migrations", run_migrations)
}
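
Exposing run_migrations_unmanaged (together with making the db and migrations modules public in the next two files) lets the init-container code paths in main.rs apply the schema to a throwaway in-memory SQLite database before constructing Services, without going through Rocket's ignite fairings.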

View File

@@ -1,4 +1,4 @@
mod migrations;
pub mod migrations;
use std::time::Duration;
use rocket::figment::Figment;

View File

@@ -1,6 +1,6 @@
use rocket::fairing::AdHoc;
mod db;
pub mod db;
mod route;
pub mod util;
pub mod cluster;

View File

@@ -2,12 +2,13 @@ pub mod api;
#[macro_use] extern crate rocket;
use dotenv::dotenv;
use rocket::{Build, Rocket};
use log::{error, info};
use log::{error, info, debug};
use std::{env, process};
use sea_orm::Database;
use crate::api::cluster::system::{ensure_ssh_keypair, ensure_system_disk, migrate_system_disk_if_necessary};
use crate::api::services::Services;
use crate::api::util::read_p5x_config;
use crate::api::db::migrations::run_migrations_unmanaged;
fn configure_rocket() -> Rocket<Build> {
rocket::build()
@@ -27,22 +28,29 @@ async fn main() {
}
let mode = &args[1];
debug!(target: "p5x", "Running with mode: {mode}");
// Intentionally generate this before migrating/ensuring the system disk, since that requires
// Services, and Services requires the SSH keys to exist. The keys generated during the system
// disk ops will be overwritten when the disk is mounted.
ensure_ssh_keypair().expect("Could not ensure SSH keypair exists.");
if mode == "ensure-system-disk" {
let anon_db = Database::connect("sqlite::memory:").await.unwrap();
let svc = Services::build(&anon_db).await.unwrap(); // fixme: this is going to fail because of the SSH keys
run_migrations_unmanaged(&anon_db).await.unwrap();
let svc = Services::build(&anon_db).await.unwrap();
ensure_system_disk(&svc).await.unwrap();
return;
}
if mode == "migrate-system-disk" {
let anon_db = Database::connect("sqlite::memory:").await.unwrap();
let svc = Services::build(&anon_db).await.unwrap(); // fixme: this is going to fail because of the SSH keys
run_migrations_unmanaged(&anon_db).await.unwrap();
let svc = Services::build(&anon_db).await.unwrap();
migrate_system_disk_if_necessary(&svc).await.unwrap();
return;
}
ensure_ssh_keypair().expect("Could not ensure SSH keypair exists.");
let config = read_p5x_config(); // Do this so we early-fail if there are missing env vars
info!(target: "p5x", "Successfully read config from environment.");
info!(target: "p5x", "Cluster host: {} ({})", config.pve_host_name, config.pve_api_host);