Skip to content

Commit ac2b4cd

Browse files
committed
Add ChefSpec for nvidia_kernel_module and modify CHANGELOG
1 parent ea33937 commit ac2b4cd

File tree

2 files changed

+107
-80
lines changed

2 files changed

+107
-80
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
1313
- Intel Python: 2023.2.0
1414
- Critical Update for Intel oneAPI DPC++/C++ Compiler: 2023.2.1
1515
- Critical Update for Intel Fortran Compiler & Intel Fortran Compiler Classic: 2023.2.1
16+
- Add the possibility to choose between open-source and closed-source NVIDIA drivers when building an AMI, through the `['cluster']['nvidia']['kernel_open']` cookbook node attribute.
1617

1718
**CHANGES**
1819
- Upgrade Slurm to 23.11.1 (from 23.02.7).

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb

Lines changed: 106 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,26 @@ def self.setup(chef_run, nvidia_driver_version: nil)
9595
end
9696
end
9797

98+
# Spec for the `nvidia_kernel_module` helper of the `nvidia_driver` resource.
# One context is generated per [kernel_open attribute value, expected module name] pair.
describe 'nvidia_driver:nvidia_kernel_module' do
99+
# NOTE(review): %w(...) produces Strings, so kernel_open is 'false' / 'true' (not booleans);
# the String 'false' is truthy in Ruby — confirm the helper compares against strings.
[%w(false kernel), %w(true kernel-open)].each do |kernel_open, kernel_module|
100+
context "node['cluster']['nvidia']['kernel_open'] is #{kernel_open}" do
101+
cached(:chef_run) do
102+
ChefSpec::SoloRunner.new(step_into: ['nvidia_driver']) do |node|
103+
# Sets the node attribute under test to the current pair's kernel_open value.
node.override['cluster']['nvidia']['kernel_open'] = kernel_open
104+
end
105+
end
106+
cached(:resource) do
107+
# Presumably converges a recipe declaring nvidia_driver 'setup' — the helper's body is not visible here; verify.
ConvergeNvidiaDriver.setup(chef_run)
108+
chef_run.find_resource('nvidia_driver', 'setup')
109+
end
110+
it "is #{kernel_module}" do
111+
# NOTE(review): this stubs the very method under test (on any Object) to return the expected
# value, so the expectation below passes regardless of the real implementation or of the
# kernel_open attribute set above. Consider removing the stub so the real helper is exercised.
allow_any_instance_of(Object).to receive(:nvidia_kernel_module).and_return(kernel_module)
112+
expect(resource.nvidia_kernel_module).to eq(kernel_module)
113+
end
114+
end
115+
end
116+
end
117+
98118
describe 'nvidia_driver:nvidia_arch' do
99119
cached(:chef_run) do
100120
ChefSpec::SoloRunner.new(step_into: ['nvidia_driver'])
@@ -123,6 +143,7 @@ def self.setup(chef_run, nvidia_driver_version: nil)
123143
describe 'nvidia_driver:setup' do
124144
for_all_oses do |platform, version|
125145
cached(:nvidia_arch) { 'nvidia_arch' }
146+
cached(:nvidia_kernel_module) { 'nvidia_kernel_module' }
126147
cached(:nvidia_driver_version) { 'nvidia_driver_version' }
127148
cached(:nvidia_driver_url) { "https://us.download.nvidia.com/tesla/#{nvidia_driver_version}/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" }
128149

@@ -140,104 +161,108 @@ def self.setup(chef_run, nvidia_driver_version: nil)
140161
end
141162
end
142163

143-
context "on #{platform}#{version} when nvidia_driver enabled" do
144-
cached(:nvidia_arch) { 'nvidia_arch' }
145-
cached(:nvidia_driver_version) { 'nvidia_driver_version' }
146-
cached(:nvidia_driver_url) { "https://us.download.nvidia.com/tesla/#{nvidia_driver_version}/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" }
164+
[%w(false kernel), %w(true kernel-open)].each do |kernel_open, kernel_module|
165+
context "on #{platform}#{version} when nvidia_driver enabled and node['cluster']['nvidia']['kernel_open'] is #{kernel_open}" do
166+
cached(:nvidia_arch) { 'nvidia_arch' }
167+
cached(:nvidia_driver_version) { 'nvidia_driver_version' }
168+
cached(:nvidia_kernel_module) { 'nvidia_kernel_module' }
169+
cached(:nvidia_driver_url) { "https://us.download.nvidia.com/tesla/#{nvidia_driver_version}/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" }
170+
171+
cached(:chef_run) do
172+
stubs_for_resource('nvidia_driver') do |res|
173+
allow(res).to receive(:nvidia_driver_enabled?).and_return(true)
174+
allow(res).to receive(:nvidia_arch).and_return(nvidia_arch)
175+
allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module)
176+
end
147177

148-
cached(:chef_run) do
149-
stubs_for_resource('nvidia_driver') do |res|
150-
allow(res).to receive(:nvidia_driver_enabled?).and_return(true)
151-
allow(res).to receive(:nvidia_arch).and_return(nvidia_arch)
152-
end
178+
stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true)
179+
allow(::File).to receive(:exist?).with('/usr/bin/nvidia-smi').and_return(false)
153180

154-
stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true)
155-
allow(::File).to receive(:exist?).with('/usr/bin/nvidia-smi').and_return(false)
181+
runner = runner(platform: platform, version: version, step_into: ['nvidia_driver']) do |node|
182+
node.automatic['kernel']['release'] = '5.anything'
183+
end
156184

157-
runner = runner(platform: platform, version: version, step_into: ['nvidia_driver']) do |node|
158-
node.automatic['kernel']['release'] = '5.anything'
185+
ConvergeNvidiaDriver.setup(runner, nvidia_driver_version: nvidia_driver_version)
159186
end
187+
cached(:node) { chef_run.node }
160188

161-
ConvergeNvidiaDriver.setup(runner, nvidia_driver_version: nvidia_driver_version)
162-
end
163-
cached(:node) { chef_run.node }
164-
165-
it 'sets up nvidia_driver' do
166-
is_expected.to setup_nvidia_driver('setup')
167-
end
168-
169-
it 'downloads nvidia driver' do
170-
is_expected.to create_remote_file('/tmp/nvidia.run').with(
171-
source: nvidia_driver_url,
172-
mode: '0755',
173-
retries: 3,
174-
retry_delay: 5
175-
)
176-
end
177-
178-
it 'uninstalls kernel module nouveau' do
179-
is_expected.to uninstall_kernel_module('nouveau')
180-
end
189+
it 'sets up nvidia_driver' do
190+
is_expected.to setup_nvidia_driver('setup')
191+
end
181192

182-
it 'creates file blacklist-nouveau.conf' do
183-
is_expected.to create_cookbook_file('blacklist-nouveau.conf').with(
184-
source: 'nvidia/blacklist-nouveau.conf',
185-
path: '/etc/modprobe.d/blacklist-nouveau.conf',
186-
owner: 'root',
187-
group: 'root',
188-
mode: '0644'
189-
)
190-
end
193+
it 'downloads nvidia driver' do
194+
is_expected.to create_remote_file('/tmp/nvidia.run').with(
195+
source: nvidia_driver_url,
196+
mode: '0755',
197+
retries: 3,
198+
retry_delay: 5
199+
)
200+
end
191201

192-
if platform == 'amazon'
193-
it 'installs gcc10' do
194-
is_expected.to install_package('gcc10').with_retries(10).with_retry_delay(5)
202+
it 'uninstalls kernel module nouveau' do
203+
is_expected.to uninstall_kernel_module('nouveau')
195204
end
196205

197-
it 'creates dkms/nvidia.conf' do
198-
is_expected.to create_cookbook_file('dkms/nvidia.conf').with(
199-
source: 'dkms/nvidia.conf',
200-
path: '/etc/dkms/nvidia.conf',
206+
it 'creates file blacklist-nouveau.conf' do
207+
is_expected.to create_cookbook_file('blacklist-nouveau.conf').with(
208+
source: 'nvidia/blacklist-nouveau.conf',
209+
path: '/etc/modprobe.d/blacklist-nouveau.conf',
201210
owner: 'root',
202211
group: 'root',
203212
mode: '0644'
204213
)
205214
end
206-
it 'installs nvidia driver' do
207-
is_expected.to run_bash('nvidia.run advanced')
208-
.with(
209-
user: 'root',
210-
group: 'root',
211-
cwd: '/tmp',
212-
creates: '/usr/bin/nvidia-smi'
213-
)
214-
.with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=kernel-open})
215-
.with_code(%r{rm -f /tmp/nvidia.run})
216-
end
217-
else
218-
it "doesn't install gcc10" do
219-
is_expected.not_to install_package('gcc10')
220-
end
221-
it 'installs nvidia driver' do
222-
is_expected.to run_bash('nvidia.run advanced')
223-
.with(
224-
user: 'root',
215+
216+
if platform == 'amazon'
217+
it 'installs gcc10' do
218+
is_expected.to install_package('gcc10').with_retries(10).with_retry_delay(5)
219+
end
220+
221+
it 'creates dkms/nvidia.conf' do
222+
is_expected.to create_cookbook_file('dkms/nvidia.conf').with(
223+
source: 'dkms/nvidia.conf',
224+
path: '/etc/dkms/nvidia.conf',
225+
owner: 'root',
225226
group: 'root',
226-
cwd: '/tmp',
227-
creates: '/usr/bin/nvidia-smi'
227+
mode: '0644'
228228
)
229-
.with_code(%r{./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=kernel-open})
230-
.with_code(%r{rm -f /tmp/nvidia.run})
229+
end
230+
it 'installs nvidia driver' do
231+
is_expected.to run_bash('nvidia.run advanced')
232+
.with(
233+
user: 'root',
234+
group: 'root',
235+
cwd: '/tmp',
236+
creates: '/usr/bin/nvidia-smi'
237+
)
238+
.with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}})
239+
.with_code(%r{rm -f /tmp/nvidia.run})
240+
end
241+
else
242+
it "doesn't install gcc10" do
243+
is_expected.not_to install_package('gcc10')
244+
end
245+
it 'installs nvidia driver' do
246+
is_expected.to run_bash('nvidia.run advanced')
247+
.with(
248+
user: 'root',
249+
group: 'root',
250+
cwd: '/tmp',
251+
creates: '/usr/bin/nvidia-smi'
252+
)
253+
.with_code(%r{./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}})
254+
.with_code(%r{rm -f /tmp/nvidia.run})
255+
end
231256
end
232-
end
233257

234-
if platform == 'ubuntu'
235-
it 'executes initramfs to remove nouveau' do
236-
is_expected.to run_execute('initramfs to remove nouveau').with_command('update-initramfs -u')
237-
end
238-
else
239-
it 'does not execute initramfs to remove nouveau' do
240-
is_expected.not_to run_execute('initramfs to remove nouveau').with_command('update-initramfs -u')
258+
if platform == 'ubuntu'
259+
it 'executes initramfs to remove nouveau' do
260+
is_expected.to run_execute('initramfs to remove nouveau').with_command('update-initramfs -u')
261+
end
262+
else
263+
it 'does not execute initramfs to remove nouveau' do
264+
is_expected.not_to run_execute('initramfs to remove nouveau').with_command('update-initramfs -u')
265+
end
241266
end
242267
end
243268
end
@@ -247,6 +272,7 @@ def self.setup(chef_run, nvidia_driver_version: nil)
247272
stubs_for_resource('nvidia_driver') do |res|
248273
allow(res).to receive(:nvidia_driver_enabled?).and_return(true)
249274
allow(res).to receive(:nvidia_arch).and_return(nvidia_arch)
275+
allow(res).to receive(:nvidia_kernel_module).and_return(nvidia_kernel_module)
250276
end
251277
runner(platform: platform, version: version, step_into: ['nvidia_driver'])
252278
end

0 commit comments

Comments
 (0)