Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[compat]
ExprTools = "0.1"
InteractiveUtils = "1"
LLVM = "9.6"
LLVM = "9.7"
Libdl = "1"
Logging = "1"
PrecompileTools = "1"
Expand Down
149 changes: 113 additions & 36 deletions src/optim.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,126 +68,201 @@ function buildEarlySimplificationPipeline(mpm, @nospecialize(job::CompilerJob),
add!(mpm, VerifierPass())
end
add!(mpm, ForceFunctionAttrsPass())
# TODO invokePipelineStartCallbacks
if LLVM.version() >= v"17"
add!(mpm, PipelineStartCallbacks(; opt_level))
end
add!(mpm, Annotation2MetadataPass())
add!(mpm, InferFunctionAttrsPass())
add!(mpm, ConstantMergePass())
add!(mpm, NewPMFunctionPassManager()) do fpm
add!(fpm, LowerExpectIntrinsicPass())
if opt_level >= 2
add!(fpm, PropagateJuliaAddrspacesPass())
end
# DCE must come before simplifycfg: codegen can generate unused
# statements that would otherwise alter how simplifycfg optimizes the CFG.
add!(fpm, DCEPass())
add!(fpm, SimplifyCFGPass(; BasicSimplifyCFGOptions...))
if opt_level >= 1
add!(fpm, DCEPass())
add!(fpm, SROAPass())
add!(fpm, EarlyCSEPass())
end
end
# TODO invokeEarlySimplificationCallbacks
if opt_level >= 1
add!(mpm, GlobalOptPass())
add!(mpm, NewPMFunctionPassManager()) do fpm
add!(fpm, PromotePass())
add!(fpm, InstCombinePass())
end
end
if LLVM.version() >= v"17"
add!(mpm, PipelineEarlySimplificationCallbacks(; opt_level))
end
end

function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)
if LLVM.version() >= v"17"
add!(mpm, OptimizerEarlyCallbacks(; opt_level))
end
add!(mpm, NewPMCGSCCPassManager()) do cgpm
# TODO invokeCGSCCCallbacks
add!(cgpm, NewPMFunctionPassManager()) do fpm
add!(fpm, AllocOptPass())
add!(fpm, Float2IntPass())
add!(fpm, LowerConstantIntrinsicsPass())
if LLVM.version() >= v"17"
add!(cgpm, CGSCCOptimizerLateCallbacks(; opt_level))
end
if opt_level >= 2
add!(cgpm, NewPMFunctionPassManager()) do fpm
add!(fpm, AllocOptPass())
add!(fpm, Float2IntPass())
add!(fpm, LowerConstantIntrinsicsPass())
end
end
end
add!(mpm, GPULowerCPUFeaturesPass())
if opt_level >= 1
add!(mpm, NewPMFunctionPassManager()) do fpm
if opt_level >= 2
add!(fpm, SROAPass())
add!(fpm, EarlyCSEPass(; memssa=true))
add!(fpm, InstCombinePass())
add!(fpm, AggressiveInstCombinePass())
add!(fpm, JumpThreadingPass())
add!(fpm, CorrelatedValuePropagationPass())
add!(fpm, LibCallsShrinkWrapPass())
add!(fpm, ReassociatePass())
add!(fpm, EarlyCSEPass())
add!(fpm, ConstraintEliminationPass())
add!(fpm, AllocOptPass())
else
add!(fpm, InstCombinePass())
add!(fpm, EarlyCSEPass())
add!(fpm, InstCombinePass())
end
if LLVM.version() >= v"17"
add!(fpm, PeepholeCallbacks(; opt_level))
end
end
# TODO invokePeepholeCallbacks
end
add!(mpm, GlobalOptPass())
add!(mpm, GlobalDCEPass())
end

function buildLoopOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
add!(fpm, NewPMLoopPassManager()) do lpm
add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
add!(lpm, LowerSIMDLoopPass())
if opt_level >= 2
add!(lpm, LoopInstSimplifyPass())
add!(lpm, LoopSimplifyCFGPass())
# run LICM with AllowSpeculation=false before rotation to avoid
# speculating loads that rotation can hoist more precisely.
add!(lpm, LICMPass(; allowspeculation=false))
add!(lpm, JuliaLICMPass())
add!(lpm, LoopRotatePass())
end
# TODO invokeLateLoopOptimizationCallbacks
end
if opt_level >= 2
add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
add!(lpm, LICMPass())
add!(lpm, JuliaLICMPass())
add!(lpm, SimpleLoopUnswitchPass(nontrivial=true, trivial=true))
add!(lpm, LICMPass())
add!(lpm, JuliaLICMPass())
end
if LLVM.version() >= v"17"
add!(lpm, LateLoopOptimizationsCallbacks(; opt_level))
end
end
if opt_level >= 2
add!(fpm, IRCEPass())
end
add!(fpm, SimplifyCFGPass(; BasicSimplifyCFGOptions...))
add!(fpm, InstCombinePass())
add!(fpm, NewPMLoopPassManager()) do lpm
if opt_level >= 2
add!(lpm, LoopInstSimplifyPass())
add!(lpm, LoopIdiomRecognizePass())
add!(lpm, IndVarSimplifyPass())
add!(lpm, SimpleLoopUnswitchPass(nontrivial=true, trivial=true))
add!(lpm, LoopDeletionPass())
add!(lpm, LoopFullUnrollPass())
end
# TODO invokeLoopOptimizerEndCallbacks
if LLVM.version() >= v"17"
add!(lpm, LoopOptimizerEndCallbacks(; opt_level))
end
end
end

function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
if opt_level >= 2
add!(fpm, AllocOptPass())
add!(fpm, SROAPass())
add!(fpm, InstSimplifyPass())
add!(fpm, VectorCombinePass())
add!(fpm, MergedLoadStoreMotionPass())
add!(fpm, GVNPass())
add!(fpm, MemCpyOptPass())
add!(fpm, SCCPPass())
add!(fpm, BDCEPass())
add!(fpm, InstCombinePass())
add!(fpm, CorrelatedValuePropagationPass())
add!(fpm, DCEPass())
add!(fpm, ADCEPass())
add!(fpm, MemCpyOptPass())
add!(fpm, DSEPass())
add!(fpm, IRCEPass())
add!(fpm, InstCombinePass())
add!(fpm, JumpThreadingPass())
add!(fpm, ConstraintEliminationPass())
elseif opt_level >= 1
add!(fpm, AllocOptPass())
add!(fpm, SROAPass())
add!(fpm, MemCpyOptPass())
add!(fpm, SCCPPass())
add!(fpm, BDCEPass())
add!(fpm, InstCombinePass())
add!(fpm, ADCEPass())
end
if opt_level >= 3
add!(fpm, GVNPass())
end
if opt_level >= 2
add!(fpm, DSEPass())
# TODO invokePeepholeCallbacks
if LLVM.version() >= v"17"
add!(fpm, PeepholeCallbacks(; opt_level))
end
add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
add!(fpm, AllocOptPass())
add!(fpm, NewPMLoopPassManager()) do lpm
add!(lpm, LoopDeletionPass())
add!(lpm, LoopInstSimplifyPass())
add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
add!(lpm, LICMPass())
add!(lpm, JuliaLICMPass())
end
add!(fpm, LoopDistributePass())
add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
add!(fpm, InstCombinePass())
elseif opt_level >= 1
add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
end
if LLVM.version() >= v"17"
add!(fpm, ScalarOptimizerLateCallbacks(; opt_level))
end
# TODO invokeScalarOptimizerCallbacks
end

function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
# re-rotate loops that might have been unrotated in the simplification above
add!(fpm, NewPMLoopPassManager()) do lpm
add!(lpm, LoopRotatePass())
add!(lpm, LoopDeletionPass())
end
add!(fpm, LoopDistributePass())
add!(fpm, InjectTLIMappings())
add!(fpm, LoopVectorizePass())
add!(fpm, LoopLoadEliminationPass())
add!(fpm, InstCombinePass())
add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
add!(lpm, LICMPass())
end
add!(fpm, EarlyCSEPass())
add!(fpm, CorrelatedValuePropagationPass())
add!(fpm, InstCombinePass())
add!(fpm, SLPVectorizerPass())
add!(fpm, VectorCombinePass())
# TODO invokeVectorizerCallbacks
add!(fpm, ADCEPass())
if LLVM.version() >= v"17"
add!(fpm, VectorizerStartCallbacks(; opt_level))
end
add!(fpm, LoopUnrollPass(; opt_level))
if LLVM.version() >= v"21"
add!(fpm, VectorizerEndCallbacks(; opt_level))
end
if LLVM.version() >= v"16"
add!(fpm, SROAPass(; preserve_cfg=true))
else
add!(fpm, SROAPass())
end
add!(fpm, InstSimplifyPass())
end

function buildIntrinsicLoweringPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)
Expand Down Expand Up @@ -272,13 +347,15 @@ function buildCleanupPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)
add!(fpm, DivRemPairsPass())
end
end
# TODO invokeOptimizerLastCallbacks
if LLVM.version() >= v"17"
add!(mpm, OptimizerLastCallbacks(; opt_level))
end
add!(mpm, NewPMFunctionPassManager()) do fpm
add!(fpm, AnnotationRemarksPass())
end
add!(mpm, NewPMFunctionPassManager()) do fpm
add!(fpm, DemoteFloat16Pass())
if opt_level >= 1
if opt_level >= 2
add!(fpm, GVNPass())
end
end
Expand Down
69 changes: 39 additions & 30 deletions src/ptx.jl
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@ runtime_slug(@nospecialize(job::CompilerJob{PTXCompilerTarget})) =

function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
mod::LLVM.Module, entry::LLVM.Function)
# tell NVVMReflect whether to flush denormals; this mirrors what Clang does
# for `-fcuda-flush-denormals-to-zero` and is the only `__nvvm_reflect` key
# LLVM's NVVMReflectPass honors besides `__CUDA_ARCH`.
flags(mod)["nvvm-reflect-ftz", LLVM.API.LLVMModuleFlagBehaviorOverride] =
Metadata(ConstantInt(Int32(job.config.target.fastmath ? 1 : 0)))

# emit the device capability and ptx isa version as constants in the module. this makes
# it possible to 'query' these in device code, relying on LLVM to optimize the checks
# away and generate static code. note that we only do so if there's actual uses of these
Expand Down Expand Up @@ -153,9 +159,14 @@ function optimize_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
tm = llvm_machine(job.config.target)
# TODO: Use the registered target passes (JuliaGPU/GPUCompiler.jl#450)
@dispose pb=NewPMPassBuilder() begin
register!(pb, NVVMReflectPass())

add!(pb, NVVMReflectPass())
if LLVM.version() < v"17"
# Pre-17 LLVM has no way to invoke EP callbacks from the string
# API, so fall back to our own nvvm_reflect! implementation.
# LLVM 17+ picks up NVPTX's built-in NVVMReflectPass through the
# PipelineStart EP invocations woven into `buildNewPMPipeline!`.
register!(pb, NVVMReflectPass())
add!(pb, NVVMReflectPass())
end

add!(pb, NewPMFunctionPassManager()) do fpm
# needed by GemmKernels.jl-like code
Expand Down Expand Up @@ -380,9 +391,12 @@ end

# Replace occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect with an integer.
#
# NOTE: this is the same as LLVM's NVVMReflect pass, which we cannot use because it is
# not exported. It is meant to be added to a pass pipeline automatically, by
# calling adjustPassManager, but we don't use a PassManagerBuilder so cannot do so.
# This is a back-port of LLVM's NVVMReflectPass for LLVM < 17, where the
# built-in pass cannot be invoked via the string-API PipelineStart EP callback.
# Semantics match LLVM's: `__CUDA_ARCH` is derived from the target capability,
# `__CUDA_FTZ` is read from the `nvvm-reflect-ftz` module flag, and every other
# key folds to 0. Knobs like denormal flushing or FMAD contraction must be
# configured through module flags or LLVM fast-math flags, not here.
const NVVM_REFLECT_FUNCTION = "__nvvm_reflect"
function nvvm_reflect!(mod::LLVM.Module)
job = current_job::CompilerJob
Expand All @@ -397,6 +411,18 @@ function nvvm_reflect!(mod::LLVM.Module)
reflect_typ = return_type(function_type(reflect_function))
isa(reflect_typ, LLVM.IntegerType) || error("_reflect's return type should be integer")

# pull __CUDA_FTZ from the nvvm-reflect-ftz module flag (same source LLVM uses)
ftz_val = 0
if haskey(flags(mod), "nvvm-reflect-ftz")
flag = flags(mod)["nvvm-reflect-ftz"]
if flag isa LLVM.ConstantAsMetadata
c = LLVM.Value(flag)
if c isa ConstantInt
ftz_val = Int(convert(Int64, c))
end
end
end

to_remove = []
for use in uses(reflect_function)
call = user(use)
Expand Down Expand Up @@ -440,31 +466,14 @@ function nvvm_reflect!(mod::LLVM.Module)
chars = convert.(Ref(UInt8), collect(sym_op))
reflect_arg = String(chars[1:end-1])

# handle possible cases
# XXX: put some of these property in the compiler job?
# and/or first set the "nvvm-reflect-*" module flag like Clang does?
fast_math = current_job.config.target.fastmath
# NOTE: we follow nvcc's --use_fast_math
reflect_val = if reflect_arg == "__CUDA_FTZ"
# single-precision denormals support
ConstantInt(reflect_typ, fast_math ? 1 : 0)
elseif reflect_arg == "__CUDA_PREC_DIV"
# single-precision floating-point division and reciprocals.
ConstantInt(reflect_typ, fast_math ? 0 : 1)
elseif reflect_arg == "__CUDA_PREC_SQRT"
# single-precision floating point square roots.
ConstantInt(reflect_typ, fast_math ? 0 : 1)
elseif reflect_arg == "__CUDA_FMAD"
# contraction of floating-point multiplies and adds/subtracts into
# floating-point multiply-add operations (FMAD, FFMA, or DFMA)
ConstantInt(reflect_typ, fast_math ? 1 : 0)
elseif reflect_arg == "__CUDA_ARCH"
ConstantInt(reflect_typ, job.config.target.cap.major*100 + job.config.target.cap.minor*10)
# match LLVM's NVVMReflectPass: unknown keys fold to 0.
reflect_val = if reflect_arg == "__CUDA_ARCH"
ConstantInt(reflect_typ,
job.config.target.cap.major*100 + job.config.target.cap.minor*10)
elseif reflect_arg == "__CUDA_FTZ"
ConstantInt(reflect_typ, ftz_val)
else
@safe_error """Unrecognized format of __nvvm_reflect call:
$(string(call))
Unknown argument $reflect_arg. Please file an issue."""
continue
ConstantInt(reflect_typ, 0)
end

replace_uses!(call, reflect_val)
Expand Down
Loading