CUDA Out of Memory during Inference

I am running Meta’s LLM Compiler (7B parameters) for inference on a single prompt on a V100 32GB GPU (which the model authors say should be sufficient for inference), but I get a CUDA out of memory error.
Monitoring the process with nvidia-smi, I can see that loading the model takes about 26 GB of the 32 GB, and the program then hits the out-of-memory error at this line:

  File "/home/<username>/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 309, in forward
    down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
                               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 404.00 MiB. GPU
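
For what it's worth, my back-of-the-envelope arithmetic for the weight memory alone (ignoring activations and the KV cache that generate builds up) is below; 7B float32 parameters already come to roughly the 26 GB I see, so I suspect there is very little headroom left on the 32 GB card, but I am not sure that is the whole story:

# Rough weight-only footprint, assuming 7e9 parameters (round number, my assumption)
params = 7e9
print(f"float32: {params * 4 / 2**30:.1f} GiB")  # ~26.1 GiB
print(f"float16: {params * 2 / 2**30:.1f} GiB")  # ~13.0 GiB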

What could be causing this issue? Are there settings or environment variables I should be setting? Also, when I run the program on my laptop (on CPU) with 32 GB of RAM, the process runs (slowly) without crashing and never uses more than 26-27 GB of RAM.
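
For context, this is the kind of environment variable I had in mind; the PyTorch memory-management documentation mentions PYTORCH_CUDA_ALLOC_CONF, but I don't know whether allocator fragmentation is actually my problem, and the 128 MiB value below is just a guess on my part, not something the model authors recommend:

import os

# Must be set before PyTorch initializes the CUDA allocator, i.e. before the
# first tensor is moved to the GPU (putting it before "import torch" is safest).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch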

Below is the code I am using, provided by the model authors:

from transformers import AutoTokenizer, AutoModelForCausalLM
from textwrap import indent
import torch

MODEL_NAMES = [
    "facebook/llm-compiler-7b",
    "facebook/llm-compiler-13b",
    "facebook/llm-compiler-7b-ftd",
    "facebook/llm-compiler-13b-ftd",
]

class LLM_Compiler:
    def __init__(self, model_name: str = "facebook/llm-compiler-7b-ftd", device: str = "cuda" if torch.cuda.is_available() else "cpu"):
        if model_name not in MODEL_NAMES:
            raise ValueError(f"model_name must be one of {MODEL_NAMES}")
        self.model_name = model_name
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name).to(self.device)
        self.model.eval()
    
    def infer(self, prompt: str, max_new_tokens: int = 50) -> str:
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        outputs = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        text: str = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return text[len(prompt): ]
    
    def emulate_ir(self, ir: str, passes: str, ir_count: int, bin_size: int, max_new_tokens: int = 50) -> str:
        prompt = f"""\
[INST] Give the LLVM-IR for the following code when optimized using opt -p '{passes}':
<code>{ir}</code>
The input code has instruction count {ir_count} and binary size {bin_size} bytes. [/INST]"""
    
        return self.infer(prompt, max_new_tokens=max_new_tokens)
    
    def emulate_asm(self, ir: str, passes: str, ir_count: int, bin_size: int, max_new_tokens: int = 50) -> str:
        prompt = f"""\
[INST] Give the assembly for the following code when optimized using opt -p '{passes}':
<code>{ir}</code>
The input code has instruction count {ir_count} and binary size {bin_size} bytes. [/INST]"""

        return self.infer(prompt, max_new_tokens=max_new_tokens)
    
    def optimize_for_code_size(self, ir: str, max_new_tokens: int = 50) -> str:
        prompt = f"""\
[INST] Tell me how to optimize this LLVM-IR for object file size:
<code>{ir}</code> [/INST]"""

        return self.infer(prompt, max_new_tokens=max_new_tokens)

    def disassemble(self, asm: str, max_new_tokens: int = 50) -> str:
        prompt = f"""\
[INST] Disassemble this code to LLVM-IR:
<code>{asm}</code> [/INST]"""

        return self.infer(prompt, max_new_tokens=max_new_tokens)

if __name__ == "__main__":
    # Demo the capabilities
    ir_count = 8
    bin_size = 65
    ir = """\
; ModuleID = '<stdin>'
source_filename = "-"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: minsize nounwind optsize uwtable
define dso_local i32 @add_two(i32 noundef %0, i32 noundef %1) #0 {
  %3 = alloca i32, align 4
  %4 = alloca i32, align 4
  store i32 %0, ptr %3, align 4, !tbaa !5
  store i32 %1, ptr %4, align 4, !tbaa !5
  %5 = load i32, ptr %3, align 4, !tbaa !5
  %6 = load i32, ptr %4, align 4, !tbaa !5
  %7 = add nsw i32 %5, %6
  ret i32 %7
}
attributes #0 = { minsize nounwind optsize uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
!llvm.module.flags = !{!0, !1, !2, !3}
!llvm.ident = !{!4}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{!"clang version 17.0.6 (git@github.com:fairinternal/CodeGen.git b05db9bbf7a92019267416c1bb9996fe6134e3f1)"}
!5 = !{!6, !6, i64 0}
!6 = !{!"int", !7, i64 0}
!7 = !{!"omnipotent char", !8, i64 0}
!8 = !{!"Simple C/C++ TBAA"}
"""

    asm = """\
.text
    .file	"-"
    .globl	add_two                         # -- Begin function add_two
    .type	add_two,@function
add_two:                                # @add_two
    .cfi_startproc
# %bb.0:
    movl	%edi, -4(%rsp)
    movl	%esi, -8(%rsp)
    movl	-4(%rsp), %eax
    addl	-8(%rsp), %eax
    retq
.Lfunc_end0:
    .size	add_two, .Lfunc_end0-add_two
    .cfi_endproc
                                        # -- End function
    .ident	"clang version 17.0.6 (git@github.com:fairinternal/CodeGen.git b05db9bbf7a92019267416c1bb9996fe6134e3f1)"
    .section	".note.GNU-stack","",@progbits
    .addrsig
"""

    passes = "module(default<Oz>)"
    max_new_tokens = 800

    # Get the model
    llm_compiler = LLM_Compiler()

    print(f"Emulating the 'opt -p {passes}' on an ir, producing ir")
    print(indent(llm_compiler.emulate_ir(ir, passes, ir_count, bin_size, max_new_tokens), "    "))

    print(f"Emulating the 'opt -p {passes}' on an ir, producing asm")
    print(indent(llm_compiler.emulate_asm(ir, passes, ir_count, bin_size, max_new_tokens), "    "))

    print(f"Getting the optimal passes for code size")
    print(indent(llm_compiler.optimize_for_code_size(ir, max_new_tokens), "    "))

    print(f"Disassembling to ir")
    print(indent(llm_compiler.disassemble(asm, max_new_tokens), "    "))
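
If it helps with diagnosing this, I can also log the allocator state from inside the script, e.g. around the generate call in infer. The helper below is hypothetical (not part of the authors' code), but memory_allocated, memory_reserved, and max_memory_allocated are all standard torch.cuda functions:

def log_gpu_memory(tag: str) -> None:
    # Hypothetical helper: print current, reserved, and peak GPU memory in GiB.
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 2**30
        reserved = torch.cuda.memory_reserved() / 2**30
        peak = torch.cuda.max_memory_allocated() / 2**30
        print(f"[{tag}] allocated={allocated:.2f} GiB, reserved={reserved:.2f} GiB, peak={peak:.2f} GiB")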

You may want to ask PyTorch questions on a PyTorch forum such as discuss.pytorch.org; there are NVIDIA experts who patrol that forum.