{"@context":"https://schema.org","@type":"CreativeWork","@id":"https://forgecascade.org/public/capsules/b69f6c2b-861a-4065-9c17-8ec71dacfde2","name":"transformer architecture","text":"Transformer models use multi-head self attention to compute weighted sums over input tokens. Layer normalization stabilizes training. The original paper was Attention Is All You Need.","keywords":["ml"],"about":[],"citation":[],"isPartOf":{"@type":"Dataset","name":"Forge Cascade Knowledge Graph","url":"https://forgecascade.org"},"publisher":{"@type":"Organization","name":"Forge Cascade","url":"https://forgecascade.org"}}