fix tests on Triton T #5

Open · wants to merge 4 commits into main · changes from all commits
97 changes: 73 additions & 24 deletions data/TritonBench_T_v1/Adam.py
@@ -1,39 +1,88 @@
import torch

def Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0):
return torch.optim.Adam(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
def simple_adam_step(
params: torch.Tensor,
grads: torch.Tensor,
lr: float = 0.001,
eps: float = 1e-08,
weight_decay: float = 0
) -> torch.Tensor:
"""
Performs a single simplified step resembling the Adam optimizer update rule.
This implementation omits the exponential moving averages (m, v) used in standard Adam,
calculating the update based only on the current gradient.

##################################################################################################################################################
Args:
params: Parameters to optimize.
grads: Gradients of the parameters.
lr: Learning rate.
eps: Term added to the denominator to improve numerical stability.
weight_decay: Weight decay (L2 penalty).

Returns:
Tensor: The updated parameters.
"""

import torch
grad = grads

if weight_decay != 0:
grad = grad.add(params.detach(), alpha=weight_decay)

m_hat = grad
v_hat = grad * grad

def Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0):
return torch.optim.Adam(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
# Denominator term in the Adam update rule
denom = torch.sqrt(v_hat).add_(eps)

def test_Adam():
update_amount = lr * m_hat / denom

new_params = params - update_amount

return new_params

##################################################################################################################################################

import torch
torch.manual_seed(42)

def test_simple_adam_step():
results = {}

# Test Case 1: Default parameters
params1 = [torch.randn(2, 2, device='cuda', requires_grad=True)]
optimizer1 = Adam(params1)
results["test_case_1"] = optimizer1.defaults
# Basic test case
params = torch.tensor([[1.0, 2.0], [3.0, 4.0]], device='cuda', requires_grad=True)
grads = torch.tensor([[0.1, 0.2], [0.3, 0.4]], device='cuda')
lr = 0.001
eps = 1e-8

updated_params = simple_adam_step(params.clone(), grads, lr=lr, eps=eps, weight_decay=0)

# Check output shape and type
results['basic_shape_match'] = updated_params.shape == params.shape
results['basic_dtype_match'] = updated_params.dtype == params.dtype
results['basic_device_match'] = updated_params.device == params.device

# Check calculation (simplified for demonstration)
expected_update = lr * torch.sign(grads)
# Using a loose check for demonstration
results['basic_calculation_approx_correct'] = torch.all(torch.abs((params - updated_params) - expected_update) < lr * 0.5).item()

# Test with weight decay
params_wd = torch.tensor([[1.0, 2.0], [3.0, 4.0]], device='cuda', requires_grad=True)
grads_wd = torch.tensor([[0.1, 0.2], [0.3, 0.4]], device='cuda')
weight_decay = 0.01

# Test Case 2: Custom learning rate
params2 = [torch.randn(2, 2, device='cuda', requires_grad=True)]
optimizer2 = Adam(params2, lr=0.01)
results["test_case_2"] = optimizer2.defaults
updated_params_wd = simple_adam_step(params_wd.clone(), grads_wd, lr=lr, eps=eps, weight_decay=weight_decay)

# Test Case 3: Custom betas
params3 = [torch.randn(2, 2, device='cuda', requires_grad=True)]
optimizer3 = Adam(params3, betas=(0.85, 0.95))
results["test_case_3"] = optimizer3.defaults
# Check output shape and type for weight decay case
results['wd_shape_match'] = updated_params_wd.shape == params_wd.shape
results['wd_dtype_match'] = updated_params_wd.dtype == params_wd.dtype
results['wd_device_match'] = updated_params_wd.device == params_wd.device

# Test Case 4: Custom weight decay
params4 = [torch.randn(2, 2, device='cuda', requires_grad=True)]
optimizer4 = Adam(params4, weight_decay=0.01)
results["test_case_4"] = optimizer4.defaults
# Check that weight decay modified the update
results['wd_params_different_from_basic'] = not torch.allclose(updated_params_wd, updated_params)

return results

test_results = test_Adam()
# Run the tests and print the results dictionary
test_results = test_simple_adam_step()
print(test_results)
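
Note on the new update rule: because m_hat = g and v_hat = g*g, the step reduces to lr * g / (|g| + eps), i.e. roughly lr * sign(g), which is what the loose check in the test exercises. A minimal usage sketch (not part of the diff; assumes a CUDA device, matching the tests above):

    import torch

    params = torch.tensor([1.0, -2.0, 3.0], device='cuda')
    grads = torch.tensor([0.5, -0.5, 0.1], device='cuda')
    # Each element moves by roughly lr in the direction opposite its gradient.
    new_params = simple_adam_step(params, grads, lr=0.001, weight_decay=0.01)
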
92 changes: 34 additions & 58 deletions data/TritonBench_T_v1/SGD.py
@@ -1,78 +1,54 @@
import torch
import torch.nn as nn

def SGD(model, input, target, loss_fn, lr=0.1, momentum=0.9):
def SGD_step(parameters: list, grads: list, lr: float = 0.1):
"""
Performs a single step of SGD optimization.

Args:
- model (torch.nn.Module): The model to optimize.
- input (torch.Tensor): The input tensor for the model.
- target (torch.Tensor): The target tensor.
- loss_fn (callable): The loss function.
- lr (float, optional): The learning rate for the optimizer. Default is 0.1.
- momentum (float, optional): The momentum for the optimizer. Default is 0.9.

Returns:
- loss (torch.Tensor): The computed loss for the step.
Updates parameters in-place.
"""
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
optimizer.zero_grad()
loss = loss_fn(model(input), target)
loss.backward()
optimizer.step()
return loss
with torch.no_grad():
for param, grad in zip(parameters, grads):
if grad is None:
continue
# Update rule: param = param - lr * grad
param.add_(grad, alpha=-lr)

##################################################################################################################################################


import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleModel(nn.Module):
def __init__(self):
super(SimpleModel, self).__init__()
self.fc = nn.Linear(10, 1)

def forward(self, x):
return self.fc(x)
torch.manual_seed(42)

def test_SGD():
results = {}

# Test case 1: Basic functionality
model = SimpleModel().cuda()
input = torch.randn(5, 10).cuda()
target = torch.randn(5, 1).cuda()
loss_fn = nn.MSELoss()
loss = SGD(model, input, target, loss_fn)
results["test_case_1"] = loss.item()
params1 = [torch.ones(2, 2, requires_grad=True, device='cuda'), torch.zeros(3, requires_grad=True, device='cuda')]
grads1 = [torch.full_like(params1[0], 2.0), torch.full_like(params1[1], -1.0)]
expected_params1 = [params1[0].clone() - 0.1 * grads1[0], params1[1].clone() - 0.1 * grads1[1]]
SGD_step(params1, grads1, lr=0.1)
results["test_case_1_param0"] = params1[0]
results["test_case_1_param1"] = params1[1]

# Test case 2: Different learning rate
model = SimpleModel().cuda()
input = torch.randn(5, 10).cuda()
target = torch.randn(5, 1).cuda()
loss_fn = nn.MSELoss()
loss = SGD(model, input, target, loss_fn, lr=0.01)
results["test_case_2"] = loss.item()

# Test case 3: Different momentum
model = SimpleModel().cuda()
input = torch.randn(5, 10).cuda()
target = torch.randn(5, 1).cuda()
loss_fn = nn.MSELoss()
loss = SGD(model, input, target, loss_fn, momentum=0.5)
results["test_case_3"] = loss.item()
# Test case 2: Different learning rate
params2 = [torch.ones(2, 2, requires_grad=True, device='cuda'), torch.zeros(3, requires_grad=True, device='cuda')]
grads2 = [torch.full_like(params2[0], 2.0), torch.full_like(params2[1], -1.0)]
lr2 = 0.01
expected_params2 = [params2[0].clone() - lr2 * grads2[0], params2[1].clone() - lr2 * grads2[1]]
SGD_step(params2, grads2, lr=lr2)
results["test_case_2_param0"] = params2[0]
results["test_case_2_param1"] = params2[1]

# Test case 3: Gradient is None for one parameter
params3 = [torch.ones(2, 2, requires_grad=True, device='cuda'), torch.zeros(3, requires_grad=True, device='cuda')]
grads3 = [torch.full_like(params3[0], 2.0), None] # Grad for second param is None
expected_params3_0 = params3[0].clone() - 0.1 * grads3[0]
expected_params3_1 = params3[1].clone() # Should remain unchanged
SGD_step(params3, grads3, lr=0.1)
results["test_case_3_param0"] = params3[0]
results["test_case_3_param1"] = params3[1] # Should be tensor of zeros

# Test case 4: Different loss function
model = SimpleModel().cuda()
input = torch.randn(5, 10).cuda()
target = torch.randint(0, 2, (5, 1)).float().cuda()
loss_fn = nn.BCEWithLogitsLoss()
loss = SGD(model, input, target, loss_fn)
results["test_case_4"] = loss.item()

return results

test_results = test_SGD()
print(test_results)
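
Note: SGD_step mutates the parameter tensors in place, so callers should pass clones if the originals need to be preserved. A minimal sketch (not part of the diff), runnable on CPU:

    import torch

    w = torch.ones(3, requires_grad=True)
    g = torch.full((3,), 0.5)
    SGD_step([w], [g], lr=0.1)
    # w is now 0.95 everywhere: 1 - 0.1 * 0.5
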
4 changes: 3 additions & 1 deletion data/TritonBench_T_v1/abs.py
@@ -1,6 +1,6 @@
import torch

def abs(input_tensor, out=None):
def abs(input_tensor: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
"""
Computes the absolute value of each element in the input tensor.

@@ -17,6 +17,7 @@ def abs(input_tensor, out=None):


import torch
torch.manual_seed(42)

def test_abs():
results = {}
@@ -40,3 +41,4 @@ def test_abs():
return results

test_results = test_abs()
print(test_results)
7 changes: 4 additions & 3 deletions data/TritonBench_T_v1/adaptive_avg_pool2d.py
@@ -1,7 +1,7 @@
import torch
import torch.nn.functional as F

def adaptive_avg_pool2d(input, output_size):
from typing import Union, Tuple
def adaptive_avg_pool2d(input: torch.Tensor, output_size: Union[int, Tuple[int, int]]) -> torch.Tensor:
"""
Apply 2D adaptive average pooling over an input signal.

@@ -28,7 +28,7 @@ def adaptive_avg_pool2d(input, output_size):


import torch
from adaptive_avg_pool2d import adaptive_avg_pool2d
torch.manual_seed(42)

def test_adaptive_avg_pool2d():
results = {}
@@ -56,3 +56,4 @@ def test_adaptive_avg_pool2d():
return results

test_results = test_adaptive_avg_pool2d()
print(test_results)
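
The Union[int, Tuple[int, int]] annotation mirrors F.adaptive_avg_pool2d's output_size contract: an int yields a square output, a tuple gives (H, W). A quick shape check (not part of the diff; assumes the wrapper forwards to F.adaptive_avg_pool2d):

    import torch

    x = torch.randn(1, 3, 17, 23)
    print(adaptive_avg_pool2d(x, 7).shape)       # torch.Size([1, 3, 7, 7])
    print(adaptive_avg_pool2d(x, (5, 9)).shape)  # torch.Size([1, 3, 5, 9])
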
8 changes: 7 additions & 1 deletion data/TritonBench_T_v1/add.py
@@ -1,6 +1,10 @@
import torch

def add(input, other, alpha=1, out=None):
def add(
input: torch.Tensor,
other: torch.Tensor,
alpha: float = 1,
out: torch.Tensor = None) -> torch.Tensor:
"""
Adds the tensor or number 'other', scaled by 'alpha', to the 'input' tensor.

@@ -19,6 +23,7 @@ def add(input, other, alpha=1, out=None):


import torch
torch.manual_seed(42)

def test_add():
results = {}
@@ -46,3 +51,4 @@ def test_add():
return results

test_results = test_add()
print(test_results)
10 changes: 8 additions & 2 deletions data/TritonBench_T_v1/add_gelu.py
@@ -2,7 +2,12 @@
import torch.nn.functional as F
import torch

def add_gelu(input, other, alpha=1, approximate='none', out=None):
def add_gelu(
input: torch.Tensor,
other: torch.Tensor,
alpha: float = 1,
approximate: str = 'none',
out: torch.Tensor = None) -> torch.Tensor:
"""
Adds the tensor or number `other`, scaled by the multiplier `alpha`, to the input tensor `input`,
and then applies the Gaussian Error Linear Units (GELU) activation function to the result.
@@ -30,7 +35,7 @@ def add_gelu(input, other, alpha=1, approximate='none', out=None):


import torch
import torch.nn.functional as F
torch.manual_seed(42)

def test_add_gelu():
results = {}
@@ -55,3 +60,4 @@ def test_add_gelu():
return results

test_results = test_add_gelu()
print(test_results)
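
Per its docstring, add_gelu composes a scaled add with GELU. A reference sketch of that composition (not part of the diff; the equivalence to the elided function body is an assumption):

    import torch
    import torch.nn.functional as F

    x, y = torch.randn(4), torch.randn(4)
    ref = F.gelu(x + 1.0 * y, approximate='none')
    out = add_gelu(x, y, alpha=1.0)  # expected to match ref
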
11 changes: 10 additions & 1 deletion data/TritonBench_T_v1/add_mean.py
@@ -1,6 +1,13 @@
import torch

def add_mean(input, other, dim=None, alpha=1, keepdim=False, dtype=None, out=None):
def add_mean(
input: torch.Tensor,
other: torch.Tensor,
dim: int = None,
alpha: float = 1,
keepdim: bool = False,
dtype: torch.dtype = None,
out: torch.Tensor = None) -> torch.Tensor:
"""
Adds the `other` tensor, scaled by `alpha`, to the `input` tensor and computes the mean value
along the specified dimension(s).
@@ -27,6 +34,7 @@ def add_mean(input, other, dim=None, alpha=1, keepdim=False, dtype=None, out=None):


import torch
torch.manual_seed(42)

def test_add_mean():
results = {}
@@ -54,3 +62,4 @@ def test_add_mean():
return results

test_results = test_add_mean()
print(test_results)
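
add_mean's docstring describes a fused add-then-mean. A small illustration (not part of the diff; assumes the elided body is torch.mean(input + alpha * other, dim=dim, keepdim=keepdim, dtype=dtype)):

    import torch

    a = torch.arange(6.0).reshape(2, 3)
    b = torch.ones(2, 3)
    print(add_mean(a, b, dim=1, alpha=2.0))  # row means of a + 2 * b -> tensor([3., 6.])
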
11 changes: 10 additions & 1 deletion data/TritonBench_T_v1/addmm.py
@@ -1,6 +1,13 @@
import torch

def addmm(input: torch.Tensor, mat1: torch.Tensor, mat2: torch.Tensor, beta: float=1, alpha: float=1, out: torch.Tensor=None) -> torch.Tensor:
def addmm(
input: torch.Tensor,
mat1: torch.Tensor,
mat2: torch.Tensor,
beta: float=1,
alpha: float=1,
out: torch.Tensor=None
) -> torch.Tensor:
"""
Performs matrix multiplication of mat1 and mat2, and adds input to the result.

@@ -28,6 +35,7 @@ def addmm(input: torch.Tensor, mat1: torch.Tensor, mat2: torch.Tensor, beta: float=1, alpha: float=1, out: torch.Tensor=None) -> torch.Tensor:


import torch
torch.manual_seed(42)

def test_addmm():
results = {}
@@ -53,3 +61,4 @@ def test_addmm():
return results

test_results = test_addmm()
print(test_results)
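
A quick consistency check for addmm (not part of the diff; assumes the wrapper keeps torch.addmm semantics, out = beta * input + alpha * (mat1 @ mat2)):

    import torch

    inp = torch.randn(2, 3)
    m1, m2 = torch.randn(2, 4), torch.randn(4, 3)
    expected = 2.0 * inp + 0.5 * (m1 @ m2)
    assert torch.allclose(addmm(inp, m1, m2, beta=2.0, alpha=0.5), expected)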