PR1: To understand how to read data and apply sentence segmentation and word tokenization using Python and NLTK.

# Iterate over a list, skipping the words "in" and "is"
array_data = ["text", "Preprocessing", "in", "Python", "This", "is", "Practical", "one"]
for item in array_data:
    if item != "in" and item != "is":
        print(item)

# Read the first line of a plain-text file
with open("/content/SMSSpamCollection (1).bin", "r") as file:
    file_data = file.readline()
print(file_data)

# Read a CSV file with the csv module
import csv

with open("/content/TestLarge (1).csv", mode='r') as csv_file:
    csv_read = csv.reader(csv_file)
    headers = next(csv_read)   # First row holds the column names
    csv_data = list(csv_read)
for row in csv_data:
    print(row)

# Read the same CSV with pandas
import pandas as pd

data = pd.read_csv("/content/TestLarge (1).csv")
print(data)
print(data["Sentiment"][:5])      # First 5 sentiments
print(data["Sentiment"].head(4))  # First 4 using head()
print(data["Sentiment"].tail(4))  # Last 4 using tail()

# Sentence segmentation with NLTK
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

corpus = "Natural Language Processing (NLP) is a subfield of computer science..."

def segmentation(text):
    # sent_tokenize already returns the list of sentences
    return nltk.sent_tokenize(text)

print(segmentation(corpus))
print(nltk.sent_tokenize(corpus))

# Sentence segmentation followed by word tokenization
input_text = "Hello strudent's ! i am Harsh here , came late to class today. cause i had taffic problem sir"

def segment_and_tokenize(text):
    sentences = nltk.sent_tokenize(text)                               # Break into sentences
    tokens = [nltk.word_tokenize(sentence) for sentence in sentences]  # Break each sentence into words
    return sentences, tokens

sentence_segments, word_tokens = segment_and_tokenize(input_text)

print("Sentence Segments:")
for sent in sentence_segments:
    print(sent)

print("Word Tokens:")
for token_list in word_tokens:
    print(token_list)
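As a follow-up, the short sketch below (an addition, not part of the original practical) shows why nltk.word_tokenize is preferred over plain whitespace splitting on the same noisy input_text: split() keeps "today." and "strudent's" as single tokens, while word_tokenize separates the trailing period and the possessive "'s".

# Hedged sketch: comparing str.split() with nltk.word_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

input_text = "Hello strudent's ! i am Harsh here , came late to class today. cause i had taffic problem sir"

print(input_text.split())              # keeps "today." and "strudent's" whole
print(nltk.word_tokenize(input_text))  # yields "today", ".", "strudent", "'s"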
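Finally, to tie file reading and tokenization together, here is a hedged sketch that tokenizes the messages in the SMS file opened earlier. It assumes the file follows the standard SMSSpamCollection layout, with a label and a message separated by a tab on each line; if the local copy differs, the split("\t") step would need adjusting.

# Assumption: each line looks like "ham\t<message text>", as in the
# standard SMSSpamCollection dataset; adjust the split if yours differs.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

with open("/content/SMSSpamCollection (1).bin", "r") as file:
    for line in file.readlines()[:5]:   # first 5 messages only
        label, message = line.strip().split("\t", 1)
        print(label, nltk.word_tokenize(message))

************************************************************************************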