Building your own Operator

The OpenAI computer-use-preview model allows you to create AI agents that can control web browsers programmatically. This guide shows how to build an operator that can interact with web interfaces using natural language commands.
Python
from playwright.sync_api import sync_playwright
from openai import OpenAI
import time
import base64

# Module-level setup: one OpenAI client and one visible Chromium page shared
# by every function below.
client = OpenAI()
# Started manually (no `with` block), so the driver stays up for the whole script.
p = sync_playwright().start()
# headless=False opens a real browser window on screen.
browser = p.chromium.launch(headless=False, chromium_sandbox=True)
page = browser.new_page()
# Same 1024x768 size as the display_width/display_height sent in the tool config.
page.set_viewport_size({"width": 1024, "height": 768})
page.goto("https://x.com")

def handle_model_action(page, action):
    action_type = action.type
    try:
        match action_type:
            case "click":
                button = "left" if action.button not in ["left", "right"] else action.button
                page.mouse.click(action.x, action.y, button=button)
            case "scroll":
                page.mouse.move(action.x, action.y)
                page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})")
            case "keypress":
                for k in action.keys:
                    key = {"enter": "Enter", "space": " "}.get(k.lower(), k)
                    page.keyboard.press(key)
            case "type":
                page.keyboard.type(action.text)
            case "wait":
                time.sleep(2)
            case "screenshot":
                pass
            case _:
                print(f"Unrecognized action: {action}")
    except Exception as e:
        print(f"Error handling action {action}: {e}")

def get_response(previous_response_id=None, human_input=None):
    """Send one user turn to the computer-use model and return the response.

    Without ``previous_response_id`` the conversation is started with the
    built-in task prompt. With it, the request continues that response
    thread: ``human_input`` is sent as the user's message and a concise
    reasoning summary is requested.
    """
    browser_tool = {
        "type": "computer_use_preview",
        "display_width": 1024,
        "display_height": 768,
        "environment": "browser"
    }
    request = {
        "model": "computer-use-preview",
        "tools": [browser_tool],
        "truncation": "auto"
    }

    # First turn: no prior response to chain from, so send the fixed task.
    if not previous_response_id:
        opening_turn = {
            "role": "user",
            "content": "Goto to x.com, ask the user to login, then summarize the latest tweets"
        }
        request["input"] = [opening_turn]
        return client.responses.create(**request)

    # Follow-up turn: chain onto the previous response with the human's reply.
    request["previous_response_id"] = previous_response_id
    request["reasoning"] = {"generate_summary": "concise"}
    request["input"] = [{"role": "user", "content": human_input}]
    return client.responses.create(**request)

def computer_use_loop(page, response):
    """Carry out the model's computer calls until it stops requesting them.

    Each iteration takes the first ``computer_call`` item in
    ``response.output``, performs its action on ``page``, waits one second,
    and sends a fresh screenshot back as the ``computer_call_output``.

    If the call carries pending safety checks, each one is shown to the user
    and must be confirmed; confirmed checks are echoed back as
    ``acknowledged_safety_checks`` so the API accepts the follow-up request.

    Args:
        page: Playwright page the actions are executed on.
        response: Latest model response to start from.

    Returns:
        The first response that contains no ``computer_call`` item.

    Raises:
        RuntimeError: If the user declines a pending safety check.
    """
    while True:
        computer_calls = [item for item in response.output if item.type == "computer_call"]
        if not computer_calls:
            return response

        call = computer_calls[0]

        # Confirm safety checks before acting; an unacknowledged check makes
        # the next request fail.
        acknowledged_checks = []
        for check in getattr(call, "pending_safety_checks", None) or []:
            print(f"Safety check ({check.code}): {check.message}")
            answer = input("Acknowledge and continue? [y/N]: ")
            if answer.strip().lower() not in ("y", "yes"):
                raise RuntimeError(f"Safety check not acknowledged: {check.code}")
            acknowledged_checks.append({
                "id": check.id,
                "code": check.code,
                "message": check.message
            })

        handle_model_action(page, call.action)
        # Give the page a moment to react before capturing its state.
        time.sleep(1)

        screenshot_base64 = base64.b64encode(page.screenshot()).decode("utf-8")
        call_output = {
            "call_id": call.call_id,
            "type": "computer_call_output",
            "output": {
                "type": "input_image",
                "image_url": f"data:image/png;base64,{screenshot_base64}"
            }
        }
        # Only add the key when there is something to acknowledge, so the
        # common request is unchanged.
        if acknowledged_checks:
            call_output["acknowledged_safety_checks"] = acknowledged_checks

        response = client.responses.create(
            model="computer-use-preview",
            previous_response_id=response.id,
            tools=[{
                "type": "computer_use_preview",
                "display_width": 1024,
                "display_height": 768,
                "environment": "browser"
            }],
            input=[call_output],
            truncation="auto")

# Conversation driver: let the model work in the browser, show its message,
# read the human's reply, and repeat until an error ends the session.
# The outer try/finally guarantees the browser and the Playwright driver are
# shut down however the script ends (error, Ctrl-C, or a failed first request).
try:
    response = get_response()
    while True:
        try:
            response = computer_use_loop(page, response)
            # A response without a message item gives us nothing to show or
            # reply to, so stop with an explicit notice.
            message = next((item for item in response.output if item.type == "message"), None)
            if message is None:
                print("No assistant message in the response; stopping.")
                break
            print("\n" + "="*50)
            print("🤖 Assistant: " + message.content[0].text)
            print("-"*50)
            human_input = input("👤 Your response: ")
            response = get_response(response.id, human_input)
        except Exception as e:
            print(e)
            break
finally:
    try:
        browser.close()
    finally:
        # Stop the driver even if closing the browser fails.
        p.stop()