fix tests on Triton T #5

Open · wants to merge 4 commits into main · changes from all commits
97 changes: 73 additions & 24 deletions data/TritonBench_T_v1/Adam.py
@@ -1,39 +1,88 @@
import torch

def Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0):
return torch.optim.Adam(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
def simple_adam_step(
params: torch.Tensor,
grads: torch.Tensor,
lr: float = 0.001,
eps: float = 1e-08,
weight_decay: float = 0
) -> torch.Tensor:
"""
Performs a single simplified step resembling the Adam optimizer update rule.
This implementation omits the exponential moving averages (m, v) used in standard Adam,
calculating the update based only on the current gradient.

##################################################################################################################################################
Args:
params: Parameters to optimize.
grads: Gradients of the parameters.
lr: Learning rate.
eps: Term added to the denominator to improve numerical stability.
weight_decay: Weight decay (L2 penalty).

Returns:
Tensor: The updated parameters.
"""

import torch
grad = grads

if weight_decay != 0:
grad = grad.add(params.detach(), alpha=weight_decay)

m_hat = grad
v_hat = grad * grad

def Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0):
return torch.optim.Adam(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
# Denominator term in the Adam update rule
denom = torch.sqrt(v_hat).add_(eps)

def test_Adam():
update_amount = lr * m_hat / denom

new_params = params - update_amount

return new_params

##################################################################################################################################################

import torch
torch.manual_seed(42)

def test_simple_adam_step():
results = {}

# Test Case 1: Default parameters
params1 = [torch.randn(2, 2, device='cuda', requires_grad=True)]
optimizer1 = Adam(params1)
results["test_case_1"] = optimizer1.defaults
# Basic test case
params = torch.tensor([[1.0, 2.0], [3.0, 4.0]], device='cuda', requires_grad=True)
grads = torch.tensor([[0.1, 0.2], [0.3, 0.4]], device='cuda')
lr = 0.001
eps = 1e-8

updated_params = simple_adam_step(params.clone(), grads, lr=lr, eps=eps, weight_decay=0)

# Check output shape and type
results['basic_shape_match'] = updated_params.shape == params.shape
results['basic_dtype_match'] = updated_params.dtype == params.dtype
results['basic_device_match'] = updated_params.device == params.device

# Check calculation (simplified for demonstration)
expected_update = lr * torch.sign(grads)
# Using a loose check for demonstration
results['basic_calculation_approx_correct'] = torch.all(torch.abs((params - updated_params) - expected_update) < lr * 0.5).item()

# Test with weight decay
params_wd = torch.tensor([[1.0, 2.0], [3.0, 4.0]], device='cuda', requires_grad=True)
grads_wd = torch.tensor([[0.1, 0.2], [0.3, 0.4]], device='cuda')
weight_decay = 0.01

# Test Case 2: Custom learning rate
params2 = [torch.randn(2, 2, device='cuda', requires_grad=True)]
optimizer2 = Adam(params2, lr=0.01)
results["test_case_2"] = optimizer2.defaults
updated_params_wd = simple_adam_step(params_wd.clone(), grads_wd, lr=lr, eps=eps, weight_decay=weight_decay)

# Test Case 3: Custom betas
params3 = [torch.randn(2, 2, device='cuda', requires_grad=True)]
optimizer3 = Adam(params3, betas=(0.85, 0.95))
results["test_case_3"] = optimizer3.defaults
# Check output shape and type for weight decay case
results['wd_shape_match'] = updated_params_wd.shape == params_wd.shape
results['wd_dtype_match'] = updated_params_wd.dtype == params_wd.dtype
results['wd_device_match'] = updated_params_wd.device == params_wd.device

# Test Case 4: Custom weight decay
params4 = [torch.randn(2, 2, device='cuda', requires_grad=True)]
optimizer4 = Adam(params4, weight_decay=0.01)
results["test_case_4"] = optimizer4.defaults
# Check that weight decay modified the update
results['wd_params_different_from_basic'] = not torch.allclose(updated_params_wd, updated_params)

return results

test_results = test_Adam()
# Run the tests and print the results dictionary
test_results = test_simple_adam_step()
print(test_results)
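
Note on the new update rule: because m_hat = g and v_hat = g*g, the step reduces to lr * g / (|g| + eps), i.e. roughly lr * sign(g), which is what the loose check in the test exercises. A minimal usage sketch (not part of the diff; assumes a CUDA device, matching the tests above):

    import torch

    params = torch.tensor([1.0, -2.0, 3.0], device='cuda')
    grads = torch.tensor([0.5, -0.5, 0.1], device='cuda')
    # Each element moves by roughly lr in the direction opposite its gradient.
    new_params = simple_adam_step(params, grads, lr=0.001, weight_decay=0.01)
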
92 changes: 34 additions & 58 deletions data/TritonBench_T_v1/SGD.py
@@ -1,78 +1,54 @@
import torch
import torch.nn as nn

def SGD(model, input, target, loss_fn, lr=0.1, momentum=0.9):
def SGD_step(parameters: list, grads: list, lr: float = 0.1):
"""
Performs a single step of SGD optimization.

Args:
- model (torch.nn.Module): The model to optimize.
- input (torch.Tensor): The input tensor for the model.
- target (torch.Tensor): The target tensor.
- loss_fn (callable): The loss function.
- lr (float, optional): The learning rate for the optimizer. Default is 0.1.
- momentum (float, optional): The momentum for the optimizer. Default is 0.9.

Returns:
- loss (torch.Tensor): The computed loss for the step.
Updates parameters in-place.
"""
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
optimizer.zero_grad()
loss = loss_fn(model(input), target)
loss.backward()
optimizer.step()
return loss
with torch.no_grad():
for param, grad in zip(parameters, grads):
if grad is None:
continue
# Update rule: param = param - lr * grad
param.add_(grad, alpha=-lr)

##################################################################################################################################################


import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleModel(nn.Module):
def __init__(self):
super(SimpleModel, self).__init__()
self.fc = nn.Linear(10, 1)

def forward(self, x):
return self.fc(x)
torch.manual_seed(42)

def test_SGD():
results = {}

# Test case 1: Basic functionality
model = SimpleModel().cuda()
input = torch.randn(5, 10).cuda()
target = torch.randn(5, 1).cuda()
loss_fn = nn.MSELoss()
loss = SGD(model, input, target, loss_fn)
results["test_case_1"] = loss.item()
params1 = [torch.ones(2, 2, requires_grad=True, device='cuda'), torch.zeros(3, requires_grad=True, device='cuda')]
grads1 = [torch.full_like(params1[0], 2.0), torch.full_like(params1[1], -1.0)]
expected_params1 = [params1[0].clone() - 0.1 * grads1[0], params1[1].clone() - 0.1 * grads1[1]]
SGD_step(params1, grads1, lr=0.1)
results["test_case_1_param0"] = params1[0]
results["test_case_1_param1"] = params1[1]

# Test case 2: Different learning rate
model = SimpleModel().cuda()
input = torch.randn(5, 10).cuda()
target = torch.randn(5, 1).cuda()
loss_fn = nn.MSELoss()
loss = SGD(model, input, target, loss_fn, lr=0.01)
results["test_case_2"] = loss.item()

# Test case 3: Different momentum
model = SimpleModel().cuda()
input = torch.randn(5, 10).cuda()
target = torch.randn(5, 1).cuda()
loss_fn = nn.MSELoss()
loss = SGD(model, input, target, loss_fn, momentum=0.5)
results["test_case_3"] = loss.item()
# Test case 2: Different learning rate
params2 = [torch.ones(2, 2, requires_grad=True, device='cuda'), torch.zeros(3, requires_grad=True, device='cuda')]
grads2 = [torch.full_like(params2[0], 2.0), torch.full_like(params2[1], -1.0)]
lr2 = 0.01
expected_params2 = [params2[0].clone() - lr2 * grads2[0], params2[1].clone() - lr2 * grads2[1]]
SGD_step(params2, grads2, lr=lr2)
results["test_case_2_param0"] = params2[0]
results["test_case_2_param1"] = params2[1]

# Test case 3: Gradient is None for one parameter
params3 = [torch.ones(2, 2, requires_grad=True, device='cuda'), torch.zeros(3, requires_grad=True, device='cuda')]
grads3 = [torch.full_like(params3[0], 2.0), None] # Grad for second param is None
expected_params3_0 = params3[0].clone() - 0.1 * grads3[0]
expected_params3_1 = params3[1].clone() # Should remain unchanged
SGD_step(params3, grads3, lr=0.1)
results["test_case_3_param0"] = params3[0]
results["test_case_3_param1"] = params3[1] # Should be tensor of zeros

# Test case 4: Different loss function
model = SimpleModel().cuda()
input = torch.randn(5, 10).cuda()
target = torch.randint(0, 2, (5, 1)).float().cuda()
loss_fn = nn.BCEWithLogitsLoss()
loss = SGD(model, input, target, loss_fn)
results["test_case_4"] = loss.item()

return results

test_results = test_SGD()
print(test_results)
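
Note: SGD_step mutates the parameter tensors in place, so callers should pass clones if the originals need to be preserved. A minimal sketch (not part of the diff), runnable on CPU:

    import torch

    w = torch.ones(3, requires_grad=True)
    g = torch.full((3,), 0.5)
    SGD_step([w], [g], lr=0.1)
    # w is now 0.95 everywhere: 1 - 0.1 * 0.5
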
4 changes: 3 additions & 1 deletion data/TritonBench_T_v1/abs.py
@@ -1,6 +1,6 @@
import torch

def abs(input_tensor, out=None):
def abs(input_tensor: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
"""
Computes the absolute value of each element in the input tensor.

@@ -17,6 +17,7 @@ def abs(input_tensor, out=None):


import torch
torch.manual_seed(42)

def test_abs():
results = {}
@@ -40,3 +41,4 @@ def test_abs():
return results

test_results = test_abs()
print(test_results)
7 changes: 4 additions & 3 deletions data/TritonBench_T_v1/adaptive_avg_pool2d.py
@@ -1,7 +1,7 @@
import torch
import torch.nn.functional as F

def adaptive_avg_pool2d(input, output_size):
from typing import Union, Tuple
def adaptive_avg_pool2d(input: torch.Tensor, output_size: Union[int, Tuple[int, int]]) -> torch.Tensor:
"""
Apply 2D adaptive average pooling over an input signal.

@@ -28,7 +28,7 @@ def adaptive_avg_pool2d(input, output_size):


import torch
from adaptive_avg_pool2d import adaptive_avg_pool2d
torch.manual_seed(42)

def test_adaptive_avg_pool2d():
results = {}
@@ -56,3 +56,4 @@ def test_adaptive_avg_pool2d():
return results

test_results = test_adaptive_avg_pool2d()
print(test_results)
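
The Union[int, Tuple[int, int]] annotation mirrors F.adaptive_avg_pool2d's output_size contract: an int yields a square output, a tuple gives (H, W). A quick shape check (not part of the diff; assumes the wrapper forwards to F.adaptive_avg_pool2d):

    import torch

    x = torch.randn(1, 3, 17, 23)
    print(adaptive_avg_pool2d(x, 7).shape)       # torch.Size([1, 3, 7, 7])
    print(adaptive_avg_pool2d(x, (5, 9)).shape)  # torch.Size([1, 3, 5, 9])
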
8 changes: 7 additions & 1 deletion data/TritonBench_T_v1/add.py
@@ -1,6 +1,10 @@
import torch

def add(input, other, alpha=1, out=None):
def add(
input: torch.Tensor,
other: torch.Tensor,
alpha: float = 1,
out: torch.Tensor = None) -> torch.Tensor:
"""
Adds the tensor or number 'other', scaled by 'alpha', to the 'input' tensor.

@@ -19,6 +23,7 @@ def add(input, other, alpha=1, out=None):


import torch
torch.manual_seed(42)

def test_add():
results = {}
@@ -46,3 +51,4 @@ def test_add():
return results

test_results = test_add()
print(test_results)
10 changes: 8 additions & 2 deletions data/TritonBench_T_v1/add_gelu.py
@@ -2,7 +2,12 @@
import torch.nn.functional as F
import torch

def add_gelu(input, other, alpha=1, approximate='none', out=None):
def add_gelu(
input: torch.Tensor,
other: torch.Tensor,
alpha: float = 1,
approximate: str = 'none',
out: torch.Tensor = None) -> torch.Tensor:
"""
Adds the tensor or number `other`, scaled by the multiplier `alpha`, to the input tensor `input`,
and then applies the Gaussian Error Linear Units (GELU) activation function to the result.
@@ -30,7 +35,7 @@ def add_gelu(input, other, alpha=1, approximate='none', out=None):


import torch
import torch.nn.functional as F
torch.manual_seed(42)

def test_add_gelu():
results = {}
@@ -55,3 +60,4 @@ def test_add_gelu():
return results

test_results = test_add_gelu()
print(test_results)
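
Per its docstring, add_gelu composes a scaled add with GELU. A reference sketch of that composition (not part of the diff; the equivalence to the elided function body is an assumption):

    import torch
    import torch.nn.functional as F

    x, y = torch.randn(4), torch.randn(4)
    ref = F.gelu(x + 1.0 * y, approximate='none')
    out = add_gelu(x, y, alpha=1.0)  # expected to match ref
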
11 changes: 10 additions & 1 deletion data/TritonBench_T_v1/add_mean.py
@@ -1,6 +1,13 @@
import torch

def add_mean(input, other, dim=None, alpha=1, keepdim=False, dtype=None, out=None):
def add_mean(
input: torch.Tensor,
other: torch.Tensor,
dim: int = None,
alpha: float = 1,
keepdim: bool = False,
dtype: torch.dtype = None,
out: torch.Tensor = None) -> torch.Tensor:
"""
Adds the `other` tensor, scaled by `alpha`, to the `input` tensor and computes the mean value
along the specified dimension(s).
@@ -27,6 +34,7 @@ def add_mean(input, other, dim=None, alpha=1, keepdim=False, dtype=None, out=None):


import torch
torch.manual_seed(42)

def test_add_mean():
results = {}
@@ -54,3 +62,4 @@ def test_add_mean():
return results

test_results = test_add_mean()
print(test_results)
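
add_mean's docstring describes a fused add-then-mean. A small illustration (not part of the diff; assumes the elided body is torch.mean(input + alpha * other, dim=dim, keepdim=keepdim, dtype=dtype)):

    import torch

    a = torch.arange(6.0).reshape(2, 3)
    b = torch.ones(2, 3)
    print(add_mean(a, b, dim=1, alpha=2.0))  # row means of a + 2 * b -> tensor([3., 6.])
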
11 changes: 10 additions & 1 deletion data/TritonBench_T_v1/addmm.py
@@ -1,6 +1,13 @@
import torch

def addmm(input: torch.Tensor, mat1: torch.Tensor, mat2: torch.Tensor, beta: float=1, alpha: float=1, out: torch.Tensor=None) -> torch.Tensor:
def addmm(
input: torch.Tensor,
mat1: torch.Tensor,
mat2: torch.Tensor,
beta: float=1,
alpha: float=1,
out: torch.Tensor=None
) -> torch.Tensor:
"""
Performs matrix multiplication of mat1 and mat2, and adds input to the result.

@@ -28,6 +35,7 @@ def addmm(input: torch.Tensor, mat1: torch.Tensor, mat2: torch.Tensor, beta: float=1, alpha: float=1, out: torch.Tensor=None) -> torch.Tensor:


import torch
torch.manual_seed(42)

def test_addmm():
results = {}
@@ -53,3 +61,4 @@ def test_addmm():
return results

test_results = test_addmm()
print(test_results)
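
A quick consistency check for addmm (not part of the diff; assumes the wrapper keeps torch.addmm semantics, out = beta * input + alpha * (mat1 @ mat2)):

    import torch

    inp = torch.randn(2, 3)
    m1, m2 = torch.randn(2, 4), torch.randn(4, 3)
    expected = 2.0 * inp + 0.5 * (m1 @ m2)
    assert torch.allclose(addmm(inp, m1, m2, beta=2.0, alpha=0.5), expected)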