import tvm
import numpy as np

# Writing a schedule in TVM
#
# We will implement a 2D convolution operator typically found in models like
# ResNet-18.
#
# This file has three TODOs:
# 1. Complete the compute declaration, which defines the 2D convolution
#    algorithm. You can inspect the slow reference implementation in Python for
#    a naive version of the algorithm, or read about the computation structure @
#    http://cs231n.github.io/convolutional-networks/#conv
#    Note that after this first step, you can verify the correctness of your
#    compute declaration without completing the second step by just running this
#    script. The default schedule, along with the time it takes to run, will be
#    printed.
# 2. Schedule the convolution computation after writing the compute declaration
#    for better performance. You can look at an example schedule for a similar
#    operation (matrix multiply) @
#    https://docs.tvm.ai/tutorials/optimize/opt_gemm.html#sphx-glr-tutorials-optimize-opt-gemm-py
# 3. Explain each of the schedule transformations you used and the performance
#    improvement of the kernel over the default version.
#
# These parameters define the shape of the convolution operator. It is important
# that you do not change them, since doing so would affect the consistency of
# our results when we check the performance of your schedule.
# To speed up debugging, you can temporarily set the number of input channels to
# 1 so that the slow reference implementation runs quickly.
#
# By default we compile all code for Intel x86 CPUs using AVX2.
# If you receive an error about illegal instructions, try removing the
# "-mcpu=core-avx2" flag from the target in the call to tvm.build.

# Shape definition for the convolution
input_channels = 64
output_channels = 64
kernel_size = 3
input_height = 56
input_width = 56
padding = (1, 1)
output_height = (input_height + 2*padding[0] - kernel_size + 1)
output_width = (input_width + 2*padding[1] - kernel_size + 1)

# We define the input in H, W, C (height, width, channels) layout
input_shape = (input_height + 2*padding[0], input_width + 2*padding[1],
               input_channels)
# We define the kernel weights in H, W, I, O (kernel height, kernel width,
# input channel, output channel) layout
weight_shape = (kernel_size, kernel_size, input_channels, output_channels)
# We define the output in H, W, C (height, width, channels) layout
output_shape = (output_height, output_width, output_channels)
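# For reference only: the sketch below shows how this version of the TVM API
# expresses a reduction with tvm.placeholder, tvm.reduce_axis, tvm.compute, and
# tvm.sum. It is a plain matrix-vector product, not the convolution needed for
# TODO #1; the names (_matvec_example, A, x, k, y) are illustrative, and the
# function is never called by the assignment code.
def _matvec_example():
    A = tvm.placeholder((16, 32), name='A')
    x = tvm.placeholder((32,), name='x')
    # k is the reduction axis: the lambda below sums A[i, k] * x[k] over k.
    k = tvm.reduce_axis((0, 32), name='k')
    y = tvm.compute((16,), lambda i: tvm.sum(A[i, k] * x[k], axis=k), name='y')
    return y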
# Slow reference version
# We provide a _very_ slow reference implementation written in Python so that
# you can check the correctness of your results after writing your TVM compute
# declaration.
def slow(data, weight):
    output = np.empty(output_shape)
    for output_channel in range(0, output_channels):
        print("channel:", output_channel)
        for output_y in range(0, output_height):
            for output_x in range(0, output_width):
                # Accumulate over the input channels and the kernel window for
                # this output element.
                accum = 0.0
                input_y = output_y
                input_x = output_x
                for input_channel in range(0, input_channels):
                    for kernel_y in range(0, kernel_size):
                        for kernel_x in range(0, kernel_size):
                            accum += (
                                np.float32(data[input_y + kernel_y][input_x + kernel_x][input_channel])
                                * np.float32(weight[kernel_y][kernel_x][input_channel][output_channel])
                            )
                output[output_y][output_x][output_channel] = np.float32(accum)
    return output


def conv2d_nhwc():
    input_placeholder = tvm.placeholder(input_shape, name='data')
    weight_placeholder = tvm.placeholder(weight_shape, name='weight')
    # Reduction axes over the input channels and the kernel window.
    rc = tvm.reduce_axis((0, input_channels), name='rc')
    ry = tvm.reduce_axis((0, kernel_size), name='ry')
    rx = tvm.reduce_axis((0, kernel_size), name='rx')
    comp = tvm.compute((output_height, output_width, output_channels),
                       lambda output_y, output_x, output_channel:
                       #TODO #1: fill in the lambda function to define the compute declaration
                       )
    s = tvm.create_schedule(comp.op)
    schedule(s, comp)
    print(tvm.lower(s, [input_placeholder, weight_placeholder, comp],
                    simple_mode=True))
    func = tvm.build(s, [input_placeholder, weight_placeholder, comp],
                     target='llvm -mcpu=core-avx2', name='conv')
    return func


def schedule(s, comp):
    yo, xo, co = comp.op.axis
    ry, rx, rc = s[comp].op.reduce_axis
    #TODO #2: write the rest of the schedule function.
    # The goal is to achieve 2x the performance of the default schedule on your
    # machine.


def main():
    func = conv2d_nhwc()

    data = np.random.random(input_shape).astype('float32')
    weight = np.random.random(weight_shape).astype('float32')

    data_tvm = tvm.nd.array(data)
    weight_tvm = tvm.nd.array(weight)
    output_tvm = tvm.nd.array(np.empty(output_shape).astype('float32'))

    # Time the compiled kernel.
    timer = func.time_evaluator(func.entry_name, tvm.cpu(0), min_repeat_ms=100)
    res = timer(data_tvm, weight_tvm, output_tvm)

    # Print statement showing timing information.
    #TODO #3: report the relative speedup and run-time numbers of your scheduled
    # kernel versus the default-schedule kernel, and explain each of the
    # schedule transformations you used and how they impacted the performance
    # of the kernel.
    print(res)

    # Check the TVM result against the slow reference implementation.
    output_tvm_numpy = output_tvm.asnumpy()
    output = slow(data, weight)
    #print(output_tvm_numpy)
    #print(output)
    np.testing.assert_allclose(output, output_tvm_numpy, rtol=1e-5)


if __name__ == '__main__':
    main()
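# For reference only: a sketch of the kind of schedule primitives the opt_gemm
# tutorial linked above uses (e.g. split, parallel, vectorize), shown on a
# simple elementwise add rather than on the convolution, so it is not a
# solution to TODO #2. The names (_schedule_primitives_example, A, B, C) are
# illustrative, the function is never called, and whether any particular
# transformation helps the convolution above is something you should measure
# with the time_evaluator in main().
def _schedule_primitives_example():
    n = 1 << 16
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute((n,), lambda i: A[i] + B[i], name='C')
    s = tvm.create_schedule(C.op)
    # Split the loop into an outer and an inner part, run the outer iterations
    # across threads, and vectorize the inner iterations.
    outer, inner = s[C].split(C.op.axis[0], factor=8)
    s[C].parallel(outer)
    s[C].vectorize(inner)
    return s, [A, B, C]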