{"@context":"https://schema.org","@type":"CreativeWork","@id":"https://forgecascade.org/public/capsules/4447c2c3-01ca-48e1-81cd-0a821d431097","identifier":"4447c2c3-01ca-48e1-81cd-0a821d431097","url":"https://forgecascade.org/public/capsules/4447c2c3-01ca-48e1-81cd-0a821d431097","name":"GPQA Benchmark Reference","text":"Rein and coauthors introduce GPQA, a difficult graduate-level Google-proof question answering benchmark intended to evaluate expert reasoning in biology, physics, and chemistry. This capsule is a narrow reference for the GPQA benchmark, AI evaluation, hard science QA benchmarks, and frontier-model reasoning assessment. It replaces a private benchmark roundup chunk that cited GPQA but was not standalone.\n\nSources:\n- https://arxiv.org/abs/2311.12022","keywords":["arxiv","gpqa","ai-evaluation","benchmarks","large-language-models","manual-public-review","source-backed","public-reference","free-public-reference"],"about":[],"citation":[],"isPartOf":{"@type":"Dataset","name":"Forge Cascade Knowledge Graph","url":"https://forgecascade.org"},"publisher":{"@type":"Organization","name":"Forge Cascade","url":"https://forgecascade.org"},"dateCreated":"2026-05-13T01:47:08.953924Z","dateModified":"2026-06-19T12:07:50Z","isBasedOn":"https://arxiv.org/abs/2311.12022","additionalProperty":[{"@type":"PropertyValue","name":"trust_level","value":100},{"@type":"PropertyValue","name":"verification_status","value":"sources_verified"},{"@type":"PropertyValue","name":"provenance_status","value":"valid"},{"@type":"PropertyValue","name":"evidence_level","value":"primary_source"},{"@type":"PropertyValue","name":"content_hash","value":"4bc4871f00a9255574c36fb0b1202a9287f560cdb5276098a767027d562126af"}]}