#pragma once #include #include #include #include #include #include namespace torch::nativert { struct ProfileMetrics { size_t primNodesCount{0}; size_t staticDispatchNodesCount{0}; size_t totalNodesCount{0}; std::vector timePerNode; std::unordered_map timePerNodeType; std::unordered_map percentPerNodeType; std::unordered_map instancesPerNodeType; std::unordered_set staticDispatchNodes; std::unordered_set primNodes; float totalTime{0}; }; /** * GraphExecutor is a lightweight abstraction to execute a graph with * execution frames without actually owning the graph nor the weights. This is * introduced to decouple the state management of the top level runtime from the * kernel executions so that sub graphs from higher order ops can be supported. */ class GraphExecutorBase { public: GraphExecutorBase( const Graph& graph, std::vector> nodeKernels, const ExecutorConfig& executorConfig); virtual ~GraphExecutorBase() = default; const Graph& graph() const { return graph_; } // This API only returns the flattened UserOutputs, // intended to be used for Inference path virtual std::vector execute( ExecutionFrame& frame, std::vector inputs) = 0; virtual std::vector executeWithPrefilledFrame( ExecutionFrame& frame) = 0; ProfileMetrics benchmarkIndividualNodes( ExecutionFrame& executionFrame, std::vector> inputs, const uint32_t warmup_runs, const uint32_t main_runs); std::vector> stealKernels() { return std::move(nodeKernels_); } void setKernels(std::vector>&& kernels) { nodeKernels_ = std::move(kernels); } protected: void fillUserInputs(ExecutionFrame& frame, std::vector inputs); const Graph& graph_; // cache of the constructed kernels to avoid reconstruction per execution std::vector> nodeKernels_; const ExecutorConfig& executorConfig_; std::unique_ptr execPlan_; }; } // namespace torch::nativert