Event JSON
{
  "id": "93d932b9ceec06baebdb958d794eb5d303976c546614cbc56fc935f1ec39c44b",
  "pubkey": "0a815c7a135490b945de4160425052fb4cd3e683e5a692950726a3afc0ec31f3",
  "created_at": 1731062569,
  "kind": 1,
  "tags": [
    [
      "p",
      "d2f4a1d96b5ebade71a0274e5fb13dffcd42a6a8d27ebd81b7b8cb25c64ed428",
      "wss://relay.mostr.pub"
    ],
    [
      "p",
      "0efb7bc903f4c6716cd4d07830d344d7abe5b607a156de3cde1ac1a5bf22ae1c",
      "wss://relay.mostr.pub"
    ],
    [
      "e",
      "a924627db30cf935a5600fe546bca94b6640e50a25b1048c02092985faae6478",
      "wss://relay.mostr.pub",
      "reply"
    ],
    [
      "proxy",
      "https://mathstodon.xyz/users/highergeometer/statuses/113446916536440820",
      "activitypub"
    ]
  ],
  "content": "nostr:npub16t62rkttt6aduudqya89lvfallx59f4g6fltmqdhhr9jt3jw6s5q3lhgu7 \"We evaluated six leading language models on our existing subset of FrontierMath problems: o1-preview (OpenAI 2024b), o1-mini (OpenAI 2024d), and GPT-4o (2024-08-06 version) (OpenAI 2024a), Claude 3.5 Sonnet (2024-10-22 version) (Anthropic 2024b), Grok 2 Beta (XAI 2024), and Google DeepMind’s Gemini 1.5 Pro 002 (GoogleAI 2024). All models had a very low performance on FrontierMath problems, with no model achieving even a 2% success rate on the full benchmark\"\n\nhe he he.",
  "sig": "3e764161809a93089e6d2e383ef0d7898873e5e17796a64275aac085cef1932a57eb98cdec7b4a770ccf5d66f985a32d1a861ceaead8f8b37c98f34033a772b2"
}
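
The "id" field above is not arbitrary: under Nostr's NIP-01, it is the SHA-256 hash of the compact JSON serialization of [0, pubkey, created_at, kind, tags, content], and "sig" is a BIP-340 Schnorr signature over that hash by the key in "pubkey". The Python sketch below recomputes the id; it assumes that the standard library's json.dumps (with ensure_ascii=False and no extra whitespace) matches NIP-01's escaping rules for this particular content, which holds for ordinary text like the note above.

import hashlib
import json

def event_id(event: dict) -> str:
    # NIP-01: serialize [0, pubkey, created_at, kind, tags, content]
    # as UTF-8 JSON with no extra whitespace, then SHA-256 the bytes.
    serialized = json.dumps(
        [0, event["pubkey"], event["created_at"], event["kind"],
         event["tags"], event["content"]],
        separators=(",", ":"),
        ensure_ascii=False,
    )
    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()

Loading the JSON above into a dict named event and calling event_id(event) should reproduce the "id" value shown. Verifying "sig" additionally requires a BIP-340 Schnorr implementation (for example, secp256k1 bindings), which is outside this sketch.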