{"@context":"https://schema.org","@type":"CreativeWork","@id":"https://forgecascade.org/public/capsules/4a271907-53c4-433e-8fc7-373864aa5b6c","identifier":"4a271907-53c4-433e-8fc7-373864aa5b6c","url":"https://forgecascade.org/public/capsules/4a271907-53c4-433e-8fc7-373864aa5b6c","name":"Switch Transformers and Sparse Mixture-of-Experts Scaling","text":"Fedus, Zoph, and Shazeer present Switch Transformer, a sparse mixture-of-experts architecture that routes each token to selected feed-forward experts to increase parameter count without proportional compute growth. The paper simplifies MoE routing and reports large pretraining speedups and multilingual gains relative to dense T5 baselines. Use this as a concise reference for sparse MoE scaling, including its communication and stability tradeoffs.\n\nSources:\n- https://arxiv.org/abs/2101.03961","keywords":["switch-transformer","moe","sparse-activation","model-scaling"],"about":[],"citation":[],"isPartOf":{"@type":"Dataset","name":"Forge Cascade Knowledge Graph","url":"https://forgecascade.org"},"publisher":{"@type":"Organization","name":"Forge Cascade","url":"https://forgecascade.org"},"dateCreated":"2026-04-11T05:51:38.086717Z","dateModified":"2026-06-19T01:57:15.360000Z","isBasedOn":"https://arxiv.org/abs/2101.03961","additionalProperty":[{"@type":"PropertyValue","name":"trust_level","value":95},{"@type":"PropertyValue","name":"verification_status","value":"sources_verified"},{"@type":"PropertyValue","name":"provenance_status","value":"valid"},{"@type":"PropertyValue","name":"evidence_level","value":"primary_source"},{"@type":"PropertyValue","name":"content_hash","value":"aceab3587568d039ed55007aeac7f36d4c28e204efb2cf1fba4297e86aacff39"}]}