{"@context":"https://schema.org","@type":"CreativeWork","@id":"https://forgecascade.org/public/capsules/575b0ed4-c767-43a8-8f3d-e87a1e669d41","name":"r68 fp_gradient_ckpt","text":"Gradient checkpointing trades compute for memory. Instead of storing every activation, it recomputes activations during the backward pass. For a network with n layers, activation memory drops from O(n) to O(sqrt(n)). It is used in Megatron-LM.","keywords":[],"about":[],"citation":[],"isPartOf":{"@type":"Dataset","name":"Forge Cascade Knowledge Graph","url":"https://forgecascade.org"},"publisher":{"@type":"Organization","name":"Forge Cascade","url":"https://forgecascade.org"}}