<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>Essay on emsenn.net</title>
    <link>https://emsenn.net/tags/essay/</link>
    <description>Recent content in Essay on emsenn.net</description>
    <generator>Hugo</generator>
    <language>en-us</language>
    <lastBuildDate>Sun, 08 Mar 2026 00:00:00 +0000</lastBuildDate>
    <atom:link href="https://emsenn.net/tags/essay/index.xml" rel="self" type="application/rss+xml" />
    <item>
      <title>The Two-Format Problem in Local Inference</title>
      <link>https://emsenn.net/library/domains/engineering/domains/tech/domains/computing/domains/on-device-inference/two-format-problem/</link>
      <pubDate>Sun, 08 Mar 2026 00:00:00 +0000</pubDate>
      <guid>https://emsenn.net/library/domains/engineering/domains/tech/domains/computing/domains/on-device-inference/two-format-problem/</guid>
      <description>&lt;p&gt;A user who wants to run &lt;a href=&#34;terms/large-language-model.md&#34; class=&#34;link-internal&#34;&gt;large language models&lt;/a&gt; locally on a machine with both a CPU and an &lt;a href=&#34;../../terms/neural-processing-unit.md&#34; class=&#34;link-internal&#34;&gt;NPU&lt;/a&gt; faces an awkward reality: the two processors require different model formats, and no practical conversion exists between them. This means downloading the same model twice, in two representations, to use both processors.&lt;/p&gt;&#xA;&lt;h2 id=&#34;two-paths-to-the-same-destination&#34;&gt;Two paths to the same destination&lt;/h2&gt;&#xA;&lt;p&gt;&lt;a href=&#34;../../../software/ollama/index.md&#34; class=&#34;link-internal&#34;&gt;Ollama&lt;/a&gt; — the dominant local inference tool — uses &lt;a href=&#34;terms/gguf.md&#34; class=&#34;link-internal&#34;&gt;GGUF&lt;/a&gt; format. GGUF was built for CPU inference: it stores &lt;a href=&#34;terms/quantization.md&#34; class=&#34;link-internal&#34;&gt;quantized&lt;/a&gt; weights in block patterns optimized for loading from system memory and processing on general-purpose cores. The entire llama.cpp ecosystem — and by extension, most of the local LLM community — speaks GGUF. When someone says they &amp;ldquo;downloaded a 7B model,&amp;rdquo; they almost certainly mean a GGUF file.&lt;/p&gt;</description>
    </item>
  </channel>
</rss>
