dart_sentencepiece_tokenizer 1.1.0

In pubspec.yaml:

dependencies:
  dart_sentencepiece_tokenizer: ^1.1.0

A lightweight, pure Dart implementation of the SentencePiece tokenizer. Supports both the BPE (Gemma) and Unigram (Llama) algorithms.

example/example.dart

// ignore_for_file: avoid_print

import 'package:dart_sentencepiece_tokenizer/dart_sentencepiece_tokenizer.dart';

void main() async {
  // Load tokenizer from .model file
  final tokenizer = await SentencePieceTokenizer.fromModelFile(
    'tokenizer.model',
    config: SentencePieceConfig.llama, // BOS token only
  );

  // Basic encoding
  final encoding = tokenizer.encode('Hello, world!');
  print('Tokens: ${encoding.tokens}');
  print('IDs: ${encoding.ids}');
  print('Attention Mask: ${encoding.attentionMask}');

  // Decode back to text
  final decoded = tokenizer.decode(encoding.ids);
  print('Decoded: $decoded');

  // Sentence pair encoding (for QA, NLI tasks)
  final pairEncoding = tokenizer.encodePair(
    'What is machine learning?',
    'Machine learning is a subset of AI.',
  );
  print('Type IDs: ${pairEncoding.typeIds}'); // 0 for first, 1 for second

  // Batch encoding
  final texts = ['Hello', 'World', 'Dart'];
  final batchEncodings = tokenizer.encodeBatch(texts);
  for (var i = 0; i < texts.length; i++) {
    print('${texts[i]}: ${batchEncodings[i].ids}');
  }

  // Enable padding and truncation
  tokenizer
    ..enablePadding(length: 32, direction: SpPaddingDirection.right)
    ..enableTruncation(maxLength: 32);

  final paddedEncoding = tokenizer.encode('Short text');
  print('Padded length: ${paddedEncoding.length}'); // 32
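
  // With truncation enabled, over-long inputs are clipped to maxLength.
  // (Illustrative addition: reuses encode() and the 32-token limit set above.)
  final longEncoding = tokenizer.encode('a somewhat longer piece of text ' * 8);
  print('Truncated length: ${longEncoding.length}'); // 32 (clipped)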

  // Offset mapping
  final text = 'Hello world';
  final enc = tokenizer.encode(text, addSpecialTokens: false);
  for (var i = 0; i < enc.length; i++) {
    final offset = enc.offsets[i];
    print('Token "${enc.tokens[i]}" -> chars ${offset.$1}:${offset.$2}');
  }
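
  // Offsets index into the original string, so each token's surface text can
  // be recovered with plain String.substring (illustrative addition):
  final first = enc.offsets.first;
  print('First token text: "${text.substring(first.$1, first.$2)}"');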

  // Vocabulary access
  print('Vocab size: ${tokenizer.vocabSize}');
  print('BOS ID: ${tokenizer.vocab.bosId}');
  print('EOS ID: ${tokenizer.vocab.eosId}');
}
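
The example above loads a Llama-style Unigram model. The package also advertises support for Gemma's BPE models; below is a minimal sketch of loading one, assuming a gemma preset exists alongside SentencePieceConfig.llama and that a Gemma tokenizer.model file is available locally (both the preset name and the path are assumptions, not confirmed API).

// Hypothetical sketch: loading a Gemma (BPE) model.
// SentencePieceConfig.gemma and the model path are assumptions.
import 'package:dart_sentencepiece_tokenizer/dart_sentencepiece_tokenizer.dart';

void main() async {
  final gemmaTokenizer = await SentencePieceTokenizer.fromModelFile(
    'gemma_tokenizer.model', // path to a Gemma SentencePiece model (assumed)
    config: SentencePieceConfig.gemma, // assumed preset, analogous to .llama
  );
  final enc = gemmaTokenizer.encode('Hello from Gemma!');
  print(enc.ids);
}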

Publisher

brodykim.work (verified)


Repository (GitHub)
View/report issues

Topics

#nlp #sentencepiece #tokenizer #machine-learning #llm

Documentation

API reference

License

MIT
