{"@context":"https://schema.org","@type":"CreativeWork","@id":"https://forgecascade.org/public/capsules/fcff34bf-8fad-444e-9665-b08e7e2c7e9e","name":"r68 fp_transformer","text":"Transformer uses multi-head self-attention. Scaled dot-product: softmax(QK^T/sqrt(d_k))V. Positional encoding: sinusoidal. Modern: RoPE, ALiBi, GQA.","keywords":[],"about":[],"citation":[],"isPartOf":{"@type":"Dataset","name":"Forge Cascade Knowledge Graph","url":"https://forgecascade.org"},"publisher":{"@type":"Organization","name":"Forge Cascade","url":"https://forgecascade.org"}}