feat: add macOS screen snapshots for monitor preview (#67954) thanks @BunsDev

Co-authored-by: Val Alexander <68980965+BunsDev@users.noreply.github.com>
This commit is contained in:
Val Alexander
2026-04-17 02:58:21 -05:00
committed by GitHub
parent 0b6c39be18
commit f377db1015
12 changed files with 321 additions and 1 deletions

View File

@@ -4,6 +4,10 @@ Docs: https://docs.openclaw.ai
## Unreleased
### Changes
- macOS/gateway: add `screen.snapshot` support for macOS app nodes, including runtime plumbing, default macOS allowlisting, and docs for monitor preview flows. (#67954) Thanks @BunsDev.
### Fixes
- Onboarding/non-interactive: preserve existing gateway auth tokens during re-onboard so active local gateway clients are not disconnected by an implicit token rotation. (#67821) Thanks @BKF-Gitty.

View File

@@ -146,6 +146,7 @@ final class MacNodeModeCoordinator {
OpenClawCanvasA2UICommand.push.rawValue,
OpenClawCanvasA2UICommand.pushJSONL.rawValue,
OpenClawCanvasA2UICommand.reset.rawValue,
MacNodeScreenCommand.snapshot.rawValue,
MacNodeScreenCommand.record.rawValue,
OpenClawSystemCommand.notify.rawValue,
OpenClawSystemCommand.which.rawValue,

View File

@@ -63,6 +63,8 @@ actor MacNodeRuntime {
return try await self.handleCameraInvoke(req)
case OpenClawLocationCommand.get.rawValue:
return try await self.handleLocationInvoke(req)
case MacNodeScreenCommand.snapshot.rawValue:
return try await self.handleScreenSnapshotInvoke(req)
case MacNodeScreenCommand.record.rawValue:
return try await self.handleScreenRecordInvoke(req)
case OpenClawSystemCommand.run.rawValue:
@@ -352,6 +354,34 @@ actor MacNodeRuntime {
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
}
/// Handles a `screen.snapshot` invoke by capturing one frame through the
/// main-actor services and returning it as a base64-encoded JSON payload.
///
/// - Parameter req: The bridge request; `paramsJSON` is decoded as
///   `MacNodeScreenSnapshotParams`.
/// - Returns: An `ok` response whose payload carries the encoded image, its
///   pixel dimensions, the requested screen index, and a capture timestamp.
/// - Throws: Whatever `snapshotScreen` or `encodePayload` throws.
private func handleScreenSnapshotInvoke(_ req: BridgeInvokeRequest) async throws -> BridgeInvokeResponse {
    // Wire shape of the response; property names are the JSON keys.
    struct SnapshotResponseBody: Encodable {
        var format: String
        var base64: String
        var width: Int
        var height: Int
        var screenIndex: Int?
        var capturedAtMs: Int64
    }

    // Best-effort decode: params that fail to decode fall back to all-nil defaults.
    let decoded = try? Self.decodeParams(MacNodeScreenSnapshotParams.self, from: req.paramsJSON)
    let request = decoded ?? MacNodeScreenSnapshotParams()

    let services = await self.mainActorServices()

    // Stamp the time immediately before the capture is requested, so the
    // reported timestamp never postdates the moment the snapshot was taken.
    let nowSeconds = Date().timeIntervalSince1970
    let startedAtMs = Int64(nowSeconds * 1000)

    let shot = try await services.snapshotScreen(
        screenIndex: request.screenIndex,
        maxWidth: request.maxWidth,
        quality: request.quality,
        format: request.format)

    let body = SnapshotResponseBody(
        format: shot.format.rawValue,
        base64: shot.data.base64EncodedString(),
        width: shot.width,
        height: shot.height,
        // Echo the caller's requested index as-is (nil when none was supplied).
        screenIndex: request.screenIndex,
        capturedAtMs: startedAtMs)
    let payloadJSON = try Self.encodePayload(body)
    return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payloadJSON)
}
private func mainActorServices() async -> any MacNodeRuntimeMainActorServices {
if let cachedMainActorServices { return cachedMainActorServices }
let services = await self.makeMainActorServices()

View File

@@ -4,6 +4,13 @@ import OpenClawKit
@MainActor
protocol MacNodeRuntimeMainActorServices: Sendable {
/// Produces a single screen snapshot.
///
/// Every parameter is optional; passing `nil` leaves the choice of default to
/// the conforming implementation.
///
/// - Parameters:
///   - screenIndex: Index of the screen to capture.
///   - maxWidth: Upper bound on the output width.
///   - quality: Encoder quality (NOTE(review): presumably only meaningful for
///     lossy formats — confirm against the implementation).
///   - format: Requested image format.
/// - Returns: The encoded image bytes, the format actually used, and the
///   image width and height.
func snapshotScreen(
screenIndex: Int?,
maxWidth: Int?,
quality: Double?,
format: OpenClawScreenSnapshotFormat?) async throws
-> (data: Data, format: OpenClawScreenSnapshotFormat, width: Int, height: Int)
func recordScreen(
screenIndex: Int?,
durationMs: Int?,
@@ -21,9 +28,24 @@ protocol MacNodeRuntimeMainActorServices: Sendable {
@MainActor
final class LiveMacNodeRuntimeMainActorServices: MacNodeRuntimeMainActorServices, @unchecked Sendable {
private let screenSnapshotter = ScreenSnapshotService()
private let screenRecorder = ScreenRecordService()
private let locationService = MacNodeLocationService()
/// Live implementation of the snapshot requirement: hands all arguments,
/// unchanged, to the owned `ScreenSnapshotService` and returns its result.
func snapshotScreen(
    screenIndex: Int?,
    maxWidth: Int?,
    quality: Double?,
    format: OpenClawScreenSnapshotFormat?) async throws
    -> (data: Data, format: OpenClawScreenSnapshotFormat, width: Int, height: Int)
{
    let captured = try await self.screenSnapshotter.snapshot(
        screenIndex: screenIndex,
        maxWidth: maxWidth,
        quality: quality,
        format: format)
    return captured
}
func recordScreen(
screenIndex: Int?,
durationMs: Int?,

View File

@@ -1,9 +1,18 @@
import Foundation
import OpenClawKit
/// Screen command identifiers understood by the macOS node; the raw values are
/// the command strings matched when an invoke request is dispatched.
enum MacNodeScreenCommand: String, Codable {
/// Capture a single still image of a screen.
case snapshot = "screen.snapshot"
/// Record a screen for a duration.
case record = "screen.record"
}
/// Parameters decoded from a `screen.snapshot` request's JSON.
/// Every field is optional, so an empty value (`MacNodeScreenSnapshotParams()`)
/// is valid and defers all choices to the snapshot service.
struct MacNodeScreenSnapshotParams: Codable, Equatable {
// Index of the screen to capture; nil lets the service pick.
var screenIndex: Int?
// Upper bound on the output image width; nil lets the service pick.
var maxWidth: Int?
// Encoder quality; nil lets the service pick.
var quality: Double?
// Requested image format; nil lets the service pick.
var format: OpenClawScreenSnapshotFormat?
}
struct MacNodeScreenRecordParams: Codable, Equatable {
var screenIndex: Int?
var durationMs: Int?

View File

@@ -0,0 +1,109 @@
import AppKit
import Foundation
import OpenClawKit
@preconcurrency import ScreenCaptureKit
/// Captures a single still frame of a display via ScreenCaptureKit and encodes
/// it as JPEG or PNG.
@MainActor
final class ScreenSnapshotService {
    /// Failures raised while resolving a display, capturing it, or encoding the image.
    enum ScreenSnapshotError: LocalizedError {
        case noDisplays
        case invalidScreenIndex(Int)
        case captureFailed(String)
        case encodeFailed(String)

        var errorDescription: String? {
            switch self {
            case .noDisplays:
                return "No displays available for screen snapshot"
            case .invalidScreenIndex(let index):
                return "Invalid screen index \(index)"
            case .captureFailed(let message), .encodeFailed(let message):
                // Both cases carry a ready-made message.
                return message
            }
        }
    }

    /// Captures one display and returns the encoded image.
    ///
    /// - Parameters:
    ///   - screenIndex: Position in the list of displays ordered by ascending
    ///     `displayID`; `nil` selects index 0.
    ///   - maxWidth: Maximum output width; `nil` or a non-positive value selects
    ///     900 for PNG and 1600 for JPEG. Images are only ever scaled down.
    ///   - quality: JPEG compression factor, clamped to `0.05...1.0`; `nil`
    ///     selects 0.72. Not used for PNG.
    ///   - format: Output format; `nil` selects JPEG.
    /// - Returns: Encoded bytes, the format used, and the captured image's
    ///   width and height.
    /// - Throws: `ScreenSnapshotError` for missing displays, an out-of-range
    ///   index, capture failure, or encode failure; errors from querying
    ///   shareable content are propagated unchanged.
    func snapshot(
        screenIndex: Int?,
        maxWidth: Int?,
        quality: Double?,
        format: OpenClawScreenSnapshotFormat?) async throws
        -> (data: Data, format: OpenClawScreenSnapshotFormat, width: Int, height: Int)
    {
        let resolvedFormat = format ?? .jpeg
        let limits = Self.normalize(maxWidth: maxWidth, quality: quality, format: resolvedFormat)

        // Sort by displayID so a given index maps to a stable ordering.
        let shareable = try await SCShareableContent.current
        let orderedDisplays = shareable.displays.sorted { $0.displayID < $1.displayID }
        if orderedDisplays.isEmpty {
            throw ScreenSnapshotError.noDisplays
        }

        let requestedIndex = screenIndex ?? 0
        guard orderedDisplays.indices.contains(requestedIndex) else {
            throw ScreenSnapshotError.invalidScreenIndex(requestedIndex)
        }
        let display = orderedDisplays[requestedIndex]

        let outputSize = Self.targetSize(
            width: display.width,
            height: display.height,
            maxWidth: limits.maxWidth)
        let configuration = SCStreamConfiguration()
        configuration.width = outputSize.width
        configuration.height = outputSize.height
        configuration.showsCursor = true
        let filter = SCContentFilter(display: display, excludingWindows: [])

        let image: CGImage
        do {
            image = try await SCScreenshotManager.captureImage(
                contentFilter: filter,
                configuration: configuration)
        } catch {
            throw ScreenSnapshotError.captureFailed(error.localizedDescription)
        }

        // Pick the encoder settings per format, then encode through one path.
        let fileType: NSBitmapImageRep.FileType
        let properties: [NSBitmapImageRep.PropertyKey: Any]
        let failureMessage: String
        switch resolvedFormat {
        case .png:
            fileType = .png
            properties = [:]
            failureMessage = "png encode failed"
        case .jpeg:
            fileType = .jpeg
            properties = [.compressionFactor: limits.quality]
            failureMessage = "jpeg encode failed"
        }

        let rep = NSBitmapImageRep(cgImage: image)
        guard let encoded = rep.representation(using: fileType, properties: properties) else {
            throw ScreenSnapshotError.encodeFailed(failureMessage)
        }
        return (data: encoded, format: resolvedFormat, width: image.width, height: image.height)
    }

    /// Resolves caller-supplied limits to concrete values: a positive max width
    /// (format-dependent default) and a quality clamped to `0.05...1.0`.
    private static func normalize(
        maxWidth: Int?,
        quality: Double?,
        format: OpenClawScreenSnapshotFormat)
        -> (maxWidth: Int, quality: Double)
    {
        let fallbackWidth = format == .png ? 900 : 1600
        let width: Int
        if let maxWidth, maxWidth > 0 {
            width = maxWidth
        } else {
            width = fallbackWidth
        }
        let clampedQuality = min(1.0, max(0.05, quality ?? 0.72))
        return (maxWidth: width, quality: clampedQuality)
    }

    /// Scales `width` x `height` down to fit `maxWidth`, preserving aspect
    /// ratio. Sizes already within the limit (or non-positive) are returned as-is.
    private static func targetSize(width: Int, height: Int, maxWidth: Int) -> (width: Int, height: Int) {
        let needsDownscale = width > 0 && height > 0 && width > maxWidth
        if !needsDownscale {
            return (width: width, height: height)
        }
        let ratio = Double(maxWidth) / Double(width)
        let scaledHeight = Int((Double(height) * ratio).rounded())
        // Never let rounding collapse the height to zero.
        return (width: maxWidth, height: max(1, scaledHeight))
    }
}

View File

@@ -78,6 +78,19 @@ struct MacNodeRuntimeTests {
@Test func `handle invoke screen record uses injected services`() async throws {
@MainActor
final class FakeMainActorServices: MacNodeRuntimeMainActorServices, @unchecked Sendable {
func snapshotScreen(
screenIndex: Int?,
maxWidth: Int?,
quality: Double?,
format: OpenClawScreenSnapshotFormat?) async throws
-> (data: Data, format: OpenClawScreenSnapshotFormat, width: Int, height: Int)
{
_ = screenIndex
_ = maxWidth
_ = quality
return (Data("snapshot".utf8), format ?? .jpeg, 640, 360)
}
func recordScreen(
screenIndex: Int?,
durationMs: Int?,
@@ -127,6 +140,94 @@ struct MacNodeRuntimeTests {
#expect(!payload.base64.isEmpty)
}
/// Verifies that a `screen.snapshot` invoke is routed to the injected
/// main-actor services with the decoded parameters, that the response payload
/// mirrors the service result, and that `capturedAtMs` is stamped no later
/// than the moment the service was called.
@Test func `handle invoke screen snapshot uses injected services`() async throws {
    @MainActor
    final class FakeMainActorServices: MacNodeRuntimeMainActorServices, @unchecked Sendable {
        /// Wall-clock time (ms) at which `snapshotScreen` was entered.
        var snapshotCalledAtMs: Int64?

        func snapshotScreen(
            screenIndex: Int?,
            maxWidth: Int?,
            quality: Double?,
            format: OpenClawScreenSnapshotFormat?) async throws
            -> (data: Data, format: OpenClawScreenSnapshotFormat, width: Int, height: Int)
        {
            self.snapshotCalledAtMs = Int64(Date().timeIntervalSince1970 * 1000)
            #expect(screenIndex == 0)
            #expect(maxWidth == 800)
            #expect(quality == 0.5)
            return (Data("ok".utf8), format ?? .jpeg, 800, 450)
        }

        func recordScreen(
            screenIndex: Int?,
            durationMs: Int?,
            fps: Double?,
            includeAudio: Bool?,
            outPath: String?) async throws -> (path: String, hasAudio: Bool)
        {
            let url = FileManager().temporaryDirectory
                .appendingPathComponent("openclaw-test-screen-record-\(UUID().uuidString).mp4")
            try Data("ok".utf8).write(to: url)
            return (path: url.path, hasAudio: false)
        }

        func locationAuthorizationStatus() -> CLAuthorizationStatus {
            .authorizedAlways
        }

        func locationAccuracyAuthorization() -> CLAccuracyAuthorization {
            .fullAccuracy
        }

        func currentLocation(
            desiredAccuracy: OpenClawLocationAccuracy,
            maxAgeMs: Int?,
            timeoutMs: Int?) async throws -> CLLocation
        {
            _ = desiredAccuracy
            _ = maxAgeMs
            _ = timeoutMs
            return CLLocation(latitude: 0, longitude: 0)
        }
    }

    let services = await MainActor.run { FakeMainActorServices() }
    let runtime = MacNodeRuntime(makeMainActorServices: { services })

    let params = MacNodeScreenSnapshotParams(
        screenIndex: 0,
        maxWidth: 800,
        quality: 0.5,
        format: .jpeg)
    let json = try String(data: JSONEncoder().encode(params), encoding: .utf8)
    let response = await runtime.handleInvoke(
        BridgeInvokeRequest(
            id: "req-screen-snapshot",
            command: MacNodeScreenCommand.snapshot.rawValue,
            paramsJSON: json))
    #expect(response.ok == true)
    let payloadJSON = try #require(response.payloadJSON)

    struct Payload: Decodable {
        var format: String
        var base64: String
        var width: Int
        var height: Int
        var capturedAtMs: Int64
    }

    let payload = try JSONDecoder().decode(Payload.self, from: Data(payloadJSON.utf8))
    #expect(payload.format == "jpeg")
    #expect(payload.base64 == Data("ok".utf8).base64EncodedString())
    #expect(payload.width == 800)
    #expect(payload.height == 450)
    #expect(payload.capturedAtMs > 0)

    // `#require` records a failure and stops this test when the fake was never
    // called; a force unwrap here would crash the whole test process instead.
    let recordedCallMs = await MainActor.run { services.snapshotCalledAtMs }
    let snapshotCalledAtMs = try #require(recordedCallMs)
    #expect(payload.capturedAtMs <= snapshotCalledAtMs)
}
@Test func `handle invoke browser proxy uses injected request`() async {
let runtime = MacNodeRuntime(browserProxyRequest: { paramsJSON in
#expect(paramsJSON?.contains("/tabs") == true)

View File

@@ -1,9 +1,34 @@
import Foundation
/// Screen command identifiers; each raw value is the command's wire name.
public enum OpenClawScreenCommand: String, Codable, Sendable {
/// Capture a single still image of a screen.
case snapshot = "screen.snapshot"
/// Record a screen.
case record = "screen.record"
}
/// Image formats a screen snapshot can be encoded in. The raw value is the
/// case name (`"jpeg"` / `"png"`) and is what appears in encoded JSON.
public enum OpenClawScreenSnapshotFormat: String, Codable, Sendable {
/// JPEG encoding.
case jpeg
/// PNG encoding.
case png
}
/// Parameters for a `screen.snapshot` request. Every field is optional and
/// defaults to `nil` in the initializer.
public struct OpenClawScreenSnapshotParams: Codable, Sendable, Equatable {
/// Index of the screen to capture.
public var screenIndex: Int?
/// Upper bound on the output image width.
public var maxWidth: Int?
/// Encoder quality.
public var quality: Double?
/// Requested image format.
public var format: OpenClawScreenSnapshotFormat?
/// Creates snapshot parameters; any argument left out stays `nil`.
public init(
screenIndex: Int? = nil,
maxWidth: Int? = nil,
quality: Double? = nil,
format: OpenClawScreenSnapshotFormat? = nil)
{
self.screenIndex = screenIndex
self.maxWidth = maxWidth
self.quality = quality
self.format = format
}
}
public struct OpenClawScreenRecordParams: Codable, Sendable, Equatable {
public var screenIndex: Int?
public var durationMs: Int?

View File

@@ -55,7 +55,7 @@ The macOS app presents itself as a node. Common commands:
- Canvas: `canvas.present`, `canvas.navigate`, `canvas.eval`, `canvas.snapshot`, `canvas.a2ui.*`
- Camera: `camera.snap`, `camera.clip`
- Screen: `screen.record`
- Screen: `screen.snapshot`, `screen.record`
- System: `system.run`, `system.notify`
The node reports a `permissions` map so agents can decide what's allowed.

View File

@@ -367,6 +367,21 @@ describe("resolveNodeCommandAllowlist", () => {
expect(DEFAULT_DANGEROUS_NODE_COMMANDS).toContain("sms.search");
});
// Pins the default policy for a macOS node with no explicit config:
// screen.snapshot is allowed out of the box, while screen.record stays on the
// dangerous list and is absent from the resolved allowlist.
it("allows macOS screen.snapshot by default but keeps screen.record gated", () => {
// Empty config: only platform defaults apply.
const allow = resolveNodeCommandAllowlist(
{},
{
platform: "macOS 26.3.1",
deviceFamily: "Mac",
},
);
// Classification: snapshot is not dangerous, record is.
expect(DEFAULT_DANGEROUS_NODE_COMMANDS).not.toContain("screen.snapshot");
expect(DEFAULT_DANGEROUS_NODE_COMMANDS).toContain("screen.record");
// Resolved allowlist reflects that classification.
expect(allow.has("screen.snapshot")).toBe(true);
expect(allow.has("screen.record")).toBe(false);
});
it("can explicitly allow dangerous commands via allowCommands", () => {
const allow = resolveNodeCommandAllowlist(
{

View File

@@ -21,6 +21,7 @@ const CANVAS_COMMANDS = [
const CAMERA_COMMANDS = ["camera.list"];
const CAMERA_DANGEROUS_COMMANDS = ["camera.snap", "camera.clip"];
const SCREEN_COMMANDS = ["screen.snapshot"];
const SCREEN_DANGEROUS_COMMANDS = ["screen.record"];
const LOCATION_COMMANDS = ["location.get"];
@@ -111,6 +112,7 @@ const PLATFORM_DEFAULTS: Record<string, string[]> = {
...PHOTOS_COMMANDS,
...MOTION_COMMANDS,
...SYSTEM_COMMANDS,
...SCREEN_COMMANDS,
],
linux: [...SYSTEM_COMMANDS],
windows: [...SYSTEM_COMMANDS],

View File

@@ -95,6 +95,8 @@ describe("configureGatewayForSetup", () => {
expect(result.settings.gatewayToken).toBe("generated-token");
expect(result.nextConfig.gateway?.nodes?.denyCommands).toEqual(DEFAULT_DANGEROUS_NODE_COMMANDS);
expect(result.nextConfig.gateway?.nodes?.denyCommands).not.toContain("screen.snapshot");
expect(result.nextConfig.gateway?.nodes?.denyCommands).toContain("screen.record");
});
it("prefers OPENCLAW_GATEWAY_TOKEN during quickstart token setup", async () => {