from nltk import word_tokenize  # requires the Punkt tokenizer data: nltk.download('punkt')

train_docs = ['this is text number one', 'another text that i have']

# Join the documents into one string and tokenize it.
# (The items are already strings, so no str() conversion is needed.)
text = ' '.join(train_docs)
tokens = word_tokenize(text)

# Build a token -> index vocabulary. Deduplicate first (dict.fromkeys
# preserves insertion order) so a repeated token such as 'text' gets one
# contiguous index, instead of overwriting an earlier entry and leaving
# gaps in the index range.
voc = {tok: i for i, tok in enumerate(dict.fromkeys(tokens))}