%%shell
# Configure Debian buster package sources so chromium + chromium-driver
# (shipped only as a snap on recent Ubuntu images) can be installed as
# real debs from Debian's repositories.
# NOTE: the original had smart-quoted heredoc delimiters and en-dashes
# instead of "--" (copy-paste artifacts); fixed to plain ASCII here.
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Fetch the Debian archive signing keys (apt-key is deprecated but still
# works on these images) and export each into the dedicated keyring that
# the signed-by= fields above point at.
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
apt-key export 77E11517 | gpg --dearmor -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmor -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmor -o /usr/share/keyrings/debian-security-buster.gpg

# Pin priorities so only chromium* is preferred from Debian.
# apt_preferences(5) requires a BLANK LINE between stanzas; the original
# file ran them together, which makes apt ignore the later entries.
cat > /etc/apt/preferences.d/chromium.pref <<'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500

Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300

Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF
!apt-get update
# -y: the notebook cell is non-interactive, so auto-confirm the install.
!apt-get install -y chromium chromium-driver
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup as bs
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
from selenium.webdriver.common.by import By
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def web_driver():
    """Create a headless Chrome WebDriver configured for a container/Colab.

    Returns:
        selenium.webdriver.Chrome: a ready-to-use headless driver.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--verbose")
    # --no-sandbox and --disable-dev-shm-usage are required when Chrome
    # runs as root inside a container with a small /dev/shm.
    options.add_argument("--no-sandbox")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    # No space after the comma: Chrome expects "--window-size=W,H".
    options.add_argument("--window-size=1920,1200")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    return driver
# Open the IMDB "user reviews" page for the title being scraped.
url = "https://www.imdb.com/title/tt3371366/reviews?ref_=tt_urv"
driver = web_driver()
driver.get(url)

# Click "Load More" until no further batches arrive.
# BUG FIX: WebDriverWait(...).until raises TimeoutException (not
# NoSuchElementException) when the trigger stops appearing, so the
# original loop could never exit cleanly; catch both to be safe.
from selenium.common.exceptions import TimeoutException

while True:
    try:
        load_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "load-more-trigger"))
        )
        load_more_button.click()
        time.sleep(2)  # give the newly requested reviews time to render
    except (TimeoutException, NoSuchElementException):
        break
# Collect the text and star rating of every loaded review, keeping the
# two lists index-aligned (a review missing either element is skipped).
reviews = []
ratings = []
review_elements = driver.find_elements(By.CLASS_NAME, "lister-item-content")
for review in review_elements:
    try:
        # Pull the body text and the numeric star rating for this review.
        content = review.find_element(By.CLASS_NAME, "content")
        text = content.find_element(By.CLASS_NAME, "text").text
        rating = content.find_element(By.CLASS_NAME, "ipl-rating-star__rating").text
        reviews.append(text)
        ratings.append(rating)
    except NoSuchElementException:
        # Some reviews lack a rating or a text node; drop them entirely
        # so reviews[i] and ratings[i] always belong to the same review.
        continue
# Pair each (text, rating) with its review title.
# NOTE(review): reviews.index(review) returns the FIRST occurrence of the
# text, so duplicate review texts -- or reviews skipped in the loop above --
# can map to the wrong element in review_elements. Kept as-is to preserve
# behavior, but worth revisiting (e.g. grab the title in the first loop).
reviews_list = []
for review, rating in zip(reviews, ratings):
    try:
        title = review_elements[reviews.index(review)].find_element(
            By.CLASS_NAME, "title"
        ).text
        reviews_list.append({"title": title, "rating": rating, "text": review})
    except NoSuchElementException:
        continue
imdb_reviews = pd.DataFrame(reviews_list)
import os

# NOTE(review): the directory created below ("drive/MyDrive/IMDB") is not
# where the CSV is written ("drive/MyDrive/") -- confirm which location was
# intended. Paths preserved to keep existing behavior.
directory = "drive/MyDrive/IMDB"
os.makedirs(directory, exist_ok=True)  # idempotent: no error if it already exists
imdb_reviews.to_csv("drive/MyDrive/transformers_reviews.csv")
import numpy as np

# Text cleaning: drop empty reviews, lowercase, then strip punctuation,
# newlines, non-ASCII characters, and digits.
imdb_reviews["text"].replace("", np.nan, inplace=True)
imdb_reviews.dropna(inplace=True)
imdb_reviews["text"] = imdb_reviews["text"].str.lower()

spec_chars = [
    "±", "@", "#", "$", "%", "^",
    "&", "*", "(", ")", "_", "+", "=",
    "-", "/", ">", "<", "?",
    "~", "`", "'", "[", "]", "|", "}",
    "{", '"', ".", ",", "!", ";",
]
for char in spec_chars:
    # regex=False so characters like "(", "*" and "?" are removed as
    # literals instead of being parsed as (invalid) regex metacharacters.
    imdb_reviews["text"] = imdb_reviews["text"].str.replace(char, "", regex=False)

imdb_reviews["text"] = imdb_reviews["text"].str.replace("\n", "", regex=False)
# BUG FIX: the original called .apply(...) without assigning the result,
# so the ASCII-stripping was a no-op; assign it back to the column.
imdb_reviews["text"] = imdb_reviews["text"].apply(
    lambda x: x.encode("ascii", "ignore").decode("ascii")
)
# Remove numbers using an explicit regex (raw string, regex=True).
imdb_reviews["text"] = imdb_reviews["text"].str.replace(r"\d+", "", regex=True)

# Keep only the numerator of ratings formatted as "8/10".
imdb_reviews["rating"] = imdb_reviews["rating"].apply(lambda x: x.split("/")[0])
This code is a Python script for scraping reviews of a movie from the IMDB website using the Selenium and BeautifulSoup libraries. The reviews are saved to a CSV file and preprocessed with NumPy and pandas.
The script starts by adding the Debian Buster repository and its signing keys to the sources.list.d directory using cat and apt-key commands. It then pins the Debian repository at a higher priority for the chromium packages and installs them with apt-get.
Next, the script imports the necessary libraries for scraping and pre-processing the reviews. It defines a function, web_driver(), to initialize a Selenium web driver with the required options. It then sets the URL of the movie reviews page and opens it using the web driver.
The script clicks on the “Load More” button to load all the reviews available on the page. It then extracts the text and rating of each review using Selenium and stores them in two lists, reviews and ratings.
The script then creates a list of dictionaries, reviews_list, to store the extracted reviews, ratings, and titles. It uses a loop to iterate over the reviews and ratings lists, extracts the title of each review, and appends a dictionary with the title, rating, and text to reviews_list.
The script converts reviews_list to a pandas DataFrame, imdb_reviews, and saves it to a CSV file using the to_csv method. It then preprocesses the reviews by removing special characters, newlines, and numbers using NumPy and pandas methods.
Finally, the preprocessed reviews are saved to the ‘text’ column of the imdb_reviews DataFrame, which can be used for further analysis.