# AIM: ADVANCED TOKENIZATION TECHNIQUES

# Importing necessary libraries
from transformers import BertTokenizer, GPT2Tokenizer, RobertaTokenizer
import sentencepiece as spm
import nltk
nltk.download('punkt')  # newer NLTK releases may also require nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

# Sample text
text = "Natural language processing (NLP) is a crucial technology for modern applications like chatbots, translation, and AI."

# 1. Basic Word Tokenization (NLTK)
print("Basic Word Tokenization (NLTK):")
word_tokens = word_tokenize(text)
print(word_tokens)

# 2. Subword Tokenization (BERT's WordPiece Tokenizer)
print("\nSubword Tokenization (BERT - WordPiece):")
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_tokens = bert_tokenizer.tokenize(text)
print(bert_tokens)

# 3. Byte Pair Encoding (BPE) with GPT-2 Tokenizer
print("\nByte Pair Encoding (GPT-2):")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_tokens = gpt2_tokenizer.tokenize(text)
print(gpt2_tokens)

# 4. Byte-Level BPE Tokenization (RoBERTa)
# Note: RoBERTa uses a byte-level BPE tokenizer (like GPT-2), not SentencePiece;
# the custom SentencePiece training in step 5 demonstrates SentencePiece itself.
print("\nByte-Level BPE Tokenization (RoBERTa):")
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_tokens = roberta_tokenizer.tokenize(text)
print(roberta_tokens)

# 5. Train your own SentencePiece tokenizer (for custom data)
print("\nCustom SentencePiece Tokenizer (Training):")
# You would typically train on a large corpus; here we simulate with a small sample
sample_data = "Natural language processing is essential for modern AI applications."
with open("sample_text.txt", "w") as f:
    f.write(sample_data)

# Train a SentencePiece model with a small vocabulary size (it must fit the tiny corpus)
spm.SentencePieceTrainer.Train('--input=sample_text.txt --model_prefix=m --vocab_size=28')
sp = spm.SentencePieceProcessor(model_file='m.model')

# Tokenize using the custom SentencePiece model
sentencepiece_tokens = sp.encode_as_pieces(text)
print(sentencepiece_tokens)
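
# --- Supplementary sketch (an illustrative addition, not part of the original
# exercise): tokenize() only produces token strings, but models consume integer
# IDs. encode()/decode() show the full round trip with the BERT tokenizer,
# including the [CLS]/[SEP] special tokens it inserts.
print("\nToken IDs and round trip (BERT):")
bert_ids = bert_tokenizer.encode(text)  # token strings -> vocabulary IDs (+ special tokens)
print(bert_ids)
print(bert_tokenizer.decode(bert_ids))  # vocabulary IDs -> text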
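
# --- Supplementary sketch (illustrative addition): subword tokenizers split
# rare or unseen words into known pieces instead of mapping them to a single
# unknown token. WordPiece marks word-internal pieces with a '##' prefix.
print("\nRare-word handling (BERT WordPiece):")
print(bert_tokenizer.tokenize("tokenization hyperparameters"))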
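
# --- Supplementary sketch (illustrative addition): comparing vocabulary sizes
# across the tokenizers above; the numbers come from the loaded models, not
# from hard-coded assumptions.
print("\nVocabulary sizes:")
print("BERT:", bert_tokenizer.vocab_size)
print("GPT-2:", gpt2_tokenizer.vocab_size)
print("RoBERTa:", roberta_tokenizer.vocab_size)
print("Custom SentencePiece:", sp.get_piece_size())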