{
  "_id": "6a1f1a94b401979e7341f1d0",
  "Package": "tok",
  "Title": "Fast Text Tokenization",
  "Version": "0.2.2",
  "Authors@R": "c(\nperson(\"Tomasz\", \"Kalinowski\", , \"tomasz@posit.co\", c(\"ctb\", \"cre\")),\nperson(\"Daniel\", \"Falbel\", , \"dfalbel@gmail.com\", c(\"aut\")),\nperson(\"Regouby\", \"Christophe\", , \"christophe.regouby@free.fr\", c(\"ctb\")),\nperson(family = \"Posit\", role = c(\"cph\"))\n)",
  "Description": "Interfaces with the 'Hugging Face' tokenizers library to\nprovide implementations of today's most used tokenizers such as\nthe 'Byte-Pair Encoding' algorithm\n<https://huggingface.co/docs/tokenizers/index>. It's extremely\nfast for both training new vocabularies and tokenizing texts.",
  "License": "MIT + file LICENSE",
  "SystemRequirements": "Cargo (Rust's package manager), rustc >= 1.91",
  "Encoding": "UTF-8",
  "RoxygenNote": "7.3.3",
  "Config/testthat/edition": "3",
  "URL": "https://github.com/mlverse/tok",
  "BugReports": "https://github.com/mlverse/tok/issues",
  "Config/rextendr/version": "0.5.0",
  "NeedsCompilation": "yes",
  "Packaged": {
    "Date": "2026-05-16 09:34:58 UTC",
    "User": "root"
  },
  "Author": "Tomasz Kalinowski [ctb, cre], Daniel Falbel [aut], Regouby\nChristophe [ctb], Posit [cph]",
  "Maintainer": "Tomasz Kalinowski <tomasz@posit.co>",
  "Config/pak/sysreqs": "libclang-dev",
  "Repository": "https://cranhaven.r-universe.dev",
  "Date/Publication": "2026-05-15 03:02:00 UTC",
  "RemoteUrl": "https://github.com/cranhaven/cranhaven.r-universe.dev",
  "RemoteRef": "package/tok",
  "RemoteSha": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
  "RemoteSubdir": "tok",
  "MD5sum": "fa7d8c3d24510a0e2f330a237589bb63",
  "_user": "cranhaven",
  "_type": "src",
  "_file": "tok_0.2.2.tar.gz",
  "_fileid": "9043ea817fae1e59b779124feb2bf7937e9bc8b895f5c46ff2872581201b2725",
  "_filesize": 8277945,
  "_sha256": "9043ea817fae1e59b779124feb2bf7937e9bc8b895f5c46ff2872581201b2725",
  "_created": "2026-05-16T09:34:58.000Z",
  "_published": "2026-06-02T18:01:56.136Z",
  "_distro": "noble",
  "_jobs": [
    {
      "job": 79138122544,
      "time": 233,
      "config": "linux-devel-arm64",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "7032385405"
    },
    {
      "job": 79138122546,
      "time": 220,
      "config": "linux-devel-x86_64",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "7032383705"
    },
    {
      "job": 79138122640,
      "time": 229,
      "config": "linux-release-arm64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7032384919"
    },
    {
      "job": 79138122455,
      "time": 219,
      "config": "linux-release-x86_64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7032383822"
    },
    {
      "job": 79138122501,
      "time": 210,
      "config": "macos-oldrel-arm64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "7032383636"
    },
    {
      "job": 79138122551,
      "time": 374,
      "config": "macos-oldrel-x86_64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "7032399845"
    },
    {
      "job": 79138122573,
      "time": 180,
      "config": "macos-release-arm64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7032379382"
    },
    {
      "job": 79138122580,
      "time": 381,
      "config": "macos-release-x86_64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7032401179"
    },
    {
      "job": 79138122047,
      "time": 273,
      "config": "source",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7032359158"
    },
    {
      "job": 79138121698,
      "time": 134,
      "config": "wasm-release",
      "r": "4.6.0",
      "check": "FAIL",
      "artifact": ""
    },
    {
      "job": 79138122090,
      "time": 363,
      "config": "windows-devel",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "7032398700"
    },
    {
      "job": 79138122481,
      "time": 312,
      "config": "windows-oldrel",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "7032393498"
    },
    {
      "job": 79138122204,
      "time": 382,
      "config": "windows-release",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7032400640"
    }
  ],
  "_buildurl": "https://github.com/r-universe/cranhaven/actions/runs/25958530340",
  "_status": "success",
  "_host": "GitHub-Actions",
  "_upstream": "https://github.com/cranhaven/cranhaven.r-universe.dev",
  "_commit": {
    "id": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
    "author": "GitHub Action <action@github.com>",
    "committer": "GitHub Action <action@github.com>",
    "message": "Add tok to CRANhaven, because archived on 2026-05-15 03:02:00 +0000\n",
    "time": 1778814120
  },
  "_maintainer": {
    "name": "Tomasz Kalinowski",
    "email": "tomasz@posit.co",
    "login": "t-kalinowski",
    "mastodon": "@t_kalinowski@fosstodon.org",
    "bluesky": "@t-kalinowski.bsky.social",
    "uuid": 8462255
  },
  "_registered": true,
  "_dependencies": [
    {
      "package": "R",
      "version": ">= 4.2.0",
      "role": "Depends"
    },
    {
      "package": "R6",
      "role": "Imports"
    },
    {
      "package": "cli",
      "role": "Imports"
    },
    {
      "package": "rmarkdown",
      "role": "Suggests"
    },
    {
      "package": "testthat",
      "version": ">= 3.0.0",
      "role": "Suggests"
    },
    {
      "package": "hfhub",
      "version": ">= 0.1.1",
      "role": "Suggests"
    },
    {
      "package": "withr",
      "role": "Suggests"
    }
  ],
  "_owner": "cranhaven",
  "_selfowned": true,
  "_usedby": 1,
  "_updates": [
    {
      "week": "2026-20",
      "n": 1
    }
  ],
  "_tags": [],
  "_topics": [
    "archived",
    "packages",
    "r-universe",
    "rust",
    "cargo"
  ],
  "_stars": 5,
  "_contributors": [
    {
      "user": "henrikbengtsson",
      "count": 183,
      "uuid": 1616850
    }
  ],
  "_userbio": {
    "uuid": 161993010,
    "type": "organization",
    "name": "cranhaven",
    "description": "CRANhaven - Repository for Recently Archived CRAN Packages"
  },
  "_downloads": {
    "count": 13203,
    "source": "https://cranlogs.r-pkg.org/downloads/total/last-month/tok"
  },
  "_devurl": "https://github.com/mlverse/tok",
  "_searchresults": 15,
  "_cargo": true,
  "_rbuild": "4.6.0",
  "_assets": [
    "extra/citation.cff",
    "extra/citation.html",
    "extra/citation.json",
    "extra/citation.txt",
    "extra/contents.json",
    "extra/NEWS.html",
    "extra/NEWS.txt",
    "extra/readme.html",
    "extra/readme.md",
    "extra/tok.html",
    "manual.pdf"
  ],
  "_homeurl": "https://github.com/mlverse/tok",
  "_realowner": "mlverse",
  "_cranurl": false,
  "_releases": [
    {
      "version": "0.1.0",
      "date": "2023-07-06"
    },
    {
      "version": "0.1.1",
      "date": "2023-08-18"
    },
    {
      "version": "0.1.2",
      "date": "2024-06-27"
    },
    {
      "version": "0.1.3",
      "date": "2024-07-06"
    },
    {
      "version": "0.1.4",
      "date": "2024-09-04"
    },
    {
      "version": "0.2.0",
      "date": "2025-08-27"
    },
    {
      "version": "0.2.1",
      "date": "2025-10-03"
    },
    {
      "version": "0.2.2",
      "date": "2026-04-22"
    }
  ],
  "_exports": [
    "decoder_byte_level",
    "encoding",
    "model_bpe",
    "model_unigram",
    "model_wordpiece",
    "normalizer_nfc",
    "normalizer_nfkc",
    "pre_tokenizer",
    "pre_tokenizer_byte_level",
    "pre_tokenizer_whitespace",
    "processor_byte_level",
    "tok_decoder",
    "tok_model",
    "tok_normalizer",
    "tok_processor",
    "tok_trainer",
    "tokenizer",
    "trainer_bpe",
    "trainer_unigram",
    "trainer_wordpiece"
  ],
  "_help": [
    {
      "page": "decoder_byte_level",
      "title": "Byte level decoder",
      "concept": [
        "decoders"
      ],
      "topics": [
        "decoder_byte_level"
      ]
    },
    {
      "page": "encoding",
      "title": "Encoding",
      "topics": [
        "encoding"
      ]
    },
    {
      "page": "model_bpe",
      "title": "BPE model",
      "concept": [
        "model"
      ],
      "topics": [
        "model_bpe"
      ]
    },
    {
      "page": "model_unigram",
      "title": "An implementation of the Unigram algorithm",
      "concept": [
        "model"
      ],
      "topics": [
        "model_unigram"
      ]
    },
    {
      "page": "model_wordpiece",
      "title": "An implementation of the WordPiece algorithm",
      "concept": [
        "model"
      ],
      "topics": [
        "model_wordpiece"
      ]
    },
    {
      "page": "normalizer_nfc",
      "title": "NFC normalizer",
      "concept": [
        "normalizers"
      ],
      "topics": [
        "normalizer_nfc"
      ]
    },
    {
      "page": "normalizer_nfkc",
      "title": "NFKC normalizer",
      "concept": [
        "normalizers"
      ],
      "topics": [
        "normalizer_nfkc"
      ]
    },
    {
      "page": "pre_tokenizer",
      "title": "Generic class for tokenizers",
      "concept": [
        "pre_tokenizer"
      ],
      "topics": [
        "pre_tokenizer"
      ]
    },
    {
      "page": "pre_tokenizer_byte_level",
      "title": "Byte level pre tokenizer",
      "concept": [
        "pre_tokenizer"
      ],
      "topics": [
        "pre_tokenizer_byte_level"
      ]
    },
    {
      "page": "pre_tokenizer_whitespace",
      "title": "This pre-tokenizer simply splits using the following regex: \\w+|[^\\w\\s]+",
      "concept": [
        "pre_tokenizer"
      ],
      "topics": [
        "pre_tokenizer_whitespace"
      ]
    },
    {
      "page": "processor_byte_level",
      "title": "Byte Level post processor",
      "concept": [
        "processors"
      ],
      "topics": [
        "processor_byte_level"
      ]
    },
    {
      "page": "tok_decoder",
      "title": "Generic class for decoders",
      "concept": [
        "decoders"
      ],
      "topics": [
        "tok_decoder"
      ]
    },
    {
      "page": "tok_model",
      "title": "Generic class for tokenization models",
      "concept": [
        "model"
      ],
      "topics": [
        "tok_model"
      ]
    },
    {
      "page": "tok_normalizer",
      "title": "Generic class for normalizers",
      "concept": [
        "normalizers"
      ],
      "topics": [
        "tok_normalizer"
      ]
    },
    {
      "page": "tok_processor",
      "title": "Generic class for processors",
      "concept": [
        "processors"
      ],
      "topics": [
        "tok_processor"
      ]
    },
    {
      "page": "tok_trainer",
      "title": "Generic training class",
      "concept": [
        "trainer"
      ],
      "topics": [
        "tok_trainer"
      ]
    },
    {
      "page": "tokenizer",
      "title": "Tokenizer",
      "topics": [
        "tokenizer"
      ]
    },
    {
      "page": "trainer_bpe",
      "title": "BPE trainer",
      "concept": [
        "trainer"
      ],
      "topics": [
        "trainer_bpe"
      ]
    },
    {
      "page": "trainer_unigram",
      "title": "Unigram tokenizer trainer",
      "concept": [
        "trainer"
      ],
      "topics": [
        "trainer_unigram"
      ]
    },
    {
      "page": "trainer_wordpiece",
      "title": "WordPiece tokenizer trainer",
      "concept": [
        "trainer"
      ],
      "topics": [
        "trainer_wordpiece"
      ]
    }
  ],
  "_readme": "https://github.com/cranhaven/cranhaven.r-universe.dev/raw/package/tok/tok/README.md",
  "_rundeps": [
    "cli",
    "R6"
  ],
  "_score": 4.171825145729989,
  "_indexed": false,
  "_nocasepkg": "tok",
  "_universes": [
    "cranhaven"
  ],
  "_indexurl": "https://mlverse.r-universe.dev/tok",
  "_binaries": [
    {
      "r": "4.7.0",
      "os": "linux",
      "version": "0.2.2",
      "date": "2026-05-16T09:39:02.000Z",
      "distro": "noble",
      "arch": "aarch64",
      "commit": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
      "fileid": "0696bc546cd98619bfdb0a5576b035a0ad24fbee8916eb94ff8a1149ebe8d1f9",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/cranhaven/actions/runs/25958530340"
    },
    {
      "r": "4.7.0",
      "os": "linux",
      "version": "0.2.2",
      "date": "2026-05-16T09:38:46.000Z",
      "distro": "noble",
      "arch": "x86_64",
      "commit": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
      "fileid": "e791daad59902cfa9e8f4b5623d514670650395cfed4f4c5d99621e4591ed327",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/cranhaven/actions/runs/25958530340"
    },
    {
      "r": "4.6.0",
      "os": "linux",
      "version": "0.2.2",
      "date": "2026-05-16T09:38:52.000Z",
      "distro": "noble",
      "arch": "aarch64",
      "commit": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
      "fileid": "4f229026a111df74a416fca37e9859a5f1cf3e45cbc2ad4f2efddedbb788fca6",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/cranhaven/actions/runs/25958530340"
    },
    {
      "r": "4.6.0",
      "os": "linux",
      "version": "0.2.2",
      "date": "2026-05-16T09:38:51.000Z",
      "distro": "noble",
      "arch": "x86_64",
      "commit": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
      "fileid": "482ceca45085834ee2b9e087a95b16b430afc71b00e8756e50a85df61d8ef7f3",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/cranhaven/actions/runs/25958530340"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "0.2.2",
      "date": "2026-05-16T09:38:30.000Z",
      "arch": "aarch64",
      "commit": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
      "fileid": "a31d28bd75ddf261d8c0a6fa3ef05c0c5bbeb6d8a87058d3f03712309345446e",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/cranhaven/actions/runs/25958530340"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "0.2.2",
      "date": "2026-05-16T09:39:24.000Z",
      "arch": "x86_64",
      "commit": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
      "fileid": "bd426abf2682d6e2f7623c47616489c6aa1458383347c793bf7f63b68f546297",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/cranhaven/actions/runs/25958530340"
    },
    {
      "r": "4.6.0",
      "os": "mac",
      "version": "0.2.2",
      "date": "2026-05-16T09:38:08.000Z",
      "arch": "aarch64",
      "commit": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
      "fileid": "5e6147076aa858f1b492c52bc5f85dbaff4667f43d3ed7a174cd8e1e20dbc471",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/cranhaven/actions/runs/25958530340"
    },
    {
      "r": "4.6.0",
      "os": "mac",
      "version": "0.2.2",
      "date": "2026-05-16T09:39:46.000Z",
      "arch": "x86_64",
      "commit": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
      "fileid": "cacae43deef22e69db2b02fa8772e0706f01803d950f511391553e905903ad2e",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/cranhaven/actions/runs/25958530340"
    },
    {
      "r": "4.7.0",
      "os": "win",
      "version": "0.2.2",
      "date": "2026-05-16T09:38:54.000Z",
      "arch": "x86_64",
      "commit": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
      "fileid": "ec4057a7bbb81e0d3f09a51b896a947d7e9be6493d00a044ec1c613156c14c7e",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/cranhaven/actions/runs/25958530340"
    },
    {
      "r": "4.5.3",
      "os": "win",
      "version": "0.2.2",
      "date": "2026-05-16T09:38:05.000Z",
      "arch": "x86_64",
      "commit": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
      "fileid": "5691ee195b1dd590dcb22b30193b0e8664f9ee111fed3de35bb2f84aa77cb90e",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/cranhaven/actions/runs/25958530340"
    },
    {
      "r": "4.6.0",
      "os": "win",
      "version": "0.2.2",
      "date": "2026-05-16T09:38:08.000Z",
      "arch": "x86_64",
      "commit": "b4a2c37fdb54bbe4ab87ce8e16cb644ead04e7ca",
      "fileid": "9ef70a6ccdd73d6e95521f66d42cc9dd541e716c58ab279046231c687258fd7d",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/cranhaven/actions/runs/25958530340"
    }
  ]
}