Pyton skripta

Pyton skripta

offline
  • Pridružio: 18 Jul 2003
  • Poruke: 4194
  • Gde živiš: U zlatnom kavezu

Imam payton skriptu koj ne radi sasvim ok, a ja ne mogu da joj nadjem gresku. Ona treba da generise slektovane recenzije hotela u csv fajl i ona do uradi, ali je problem sto bez obzira na broj selekcija ona obradi samo pet. Ako neko zna neka pomogne prilazem kod

# !/usr/bin/python # -*- coding: utf-8 -*- # importing libraries from bs4 import BeautifulSoup import urllib, csv, os, datetime, urllib.request, re, sys # creating CSV file to be used file = open(os.path.expanduser(r"~/Desktop/TripAdviser Reviews.csv"), "wb") file.write(     b"Organization" + b"," + b"Address" + b"," + b"Reviewer" + b"," + b"Review Title" + b"," + b"Review" + b"," + b"Review Count" + b"," + b"Help Count"      + b"," + b"Attraction Count" + b"," + b"Restaurant Count" + b"," + b"Hotel Count" + b"," +  b"Rating Date" + b"," + b"Rating" + b"\n") # List the first page of the reviews (ends with "#REVIEWS") - separate the websites with , WebSites = [     "https://www.tripadvisor.com/Hotel_Review-g294472-d7181993-Reviews-Holiday_Inn_Express_Belgrade_City-Belgrade.html" ] Checker = "REVIEWS" # looping through each site until it hits a break for theurl in WebSites:     thepage = urllib.request.urlopen(theurl)     soup = BeautifulSoup(thepage, 'html.parser')     while True:         # extract the help count, restaurant review count, attraction review count and hotel review count         a = b = 0         helpcountarray = restaurantarray = attractionarray = hotelarray = ""         for profile in soup.findAll(attrs={"class": "memberBadgingNoText"}):             textFromHt = profile.findAll(text=True)             image = '\n'.join(textFromHt)             image = image.replace("\n", "|||||").strip()             # print "".join(profile.findAll(text=True))             if image.find("helpful") > 0:                 counter = image.split("helpful", 1)[0].split("|", 1)[1][-4:].replace("|", "").strip()                 if len(helpcountarray) == 0:                     helpcountarray = [counter]                 else:                     helpcountarray.append(counter)             elif image.find("helpful") < 0:                 if len(helpcountarray) == 0:                     helpcountarray = ["0"]                 else:                     helpcountarray.append("0")             if image.find("attraction") > 0:                 counter = image.split("attraction", 1)[0].split("|", 1)[1][-4:].replace("|", "").strip()                 if len(attractionarray) == 0:                     attractionarray = [counter]                 else:                     attractionarray.append(counter)             elif image.find("attraction") < 0:                 if len(attractionarray) == 0:                     attractionarray = ["0"]                 else:                     attractionarray.append("0")             if image.find("REVIEWS_RESTAURANTS") > 0:                 counter = image.split("REVIEWS_RESTAURANTS", 1)[0].split("|", 1)[1][-4:].replace("|", "").strip()                 if len(restaurantarray) == 0:                     restaurantarray = [0]                 else:                     restaurantarray.append("0")             elif image.find("REVIEWS_RESTAURANTS") < 0:                 if len(restaurantarray) == 0:                     restaurantarray = ["0"]                 else:                     restaurantarray.append("0")             if image.find("REVIEWS_HOTELS") > 0:                 counter = image.split("REVIEWS_HOTELS", 1)[0].split("|", 1)[1][-4:].replace("|", "").strip()                 if len(hotelarray) == 0:                     hotelarray = [counter]                 else:                     hotelarray.append(counter)             elif image.find("REVIEWS_HOTELS") < 0:                 if len(hotelarray) == 0:                     hotelarray = ["0"]                 else:                     hotelarray.append("0")                     # extract the rating count for each user review         #altarray = ""         #for rating in soup.findAll(attrs={"class": "rating reviewItemInline"}):          #   alt = rating.find('img', alt=True)          #   if alt[-5:] == '':           #      if len(altarray) == 0:           #          altarray = [alt]        # else:             # noinspection PyUnboundLocalVariable         #    altarray.append(alt)         Organization = soup.find(attrs={'class': 'heading_title'                                         }).text.replace('"', ' ').replace('Review of', ' ').strip()         Address = soup.findAll(attrs={'class': 'address_search'                                       })[0].text.replace(',', '').replace('\n', ''                                                                           ).strip()         # Loop through each review on the page         for x in range(0, len(hotelarray)):             # noinspection PyBroadException             try:                 Reviewer = soup.findAll(attrs={"class": "username mo"})[x].text             except:                 Reviewer = "N/A"                 continue             Reviewer = Reviewer.replace(',', ' ').replace('”', '').replace('“', '').replace('"', '').strip()             ReviewTitle = soup.findAll(attrs={"class": "quote"})[x].text.replace(',', ' ').replace('”', '').replace('“',                                                                                                                     '').replace(                 '"', '').replace('é', 'e').strip()             Review = soup.findAll(attrs={"class": "entry"})[x].text.replace(',', ' ').replace('\n', ' ').strip()             RatingDate = soup.findAll(attrs={"class": "ratingDate"})[x].text.replace('Reviewed', ' ').replace('NEW',                                                                                                               ' ').replace(                 ',', ' ').strip()             # Rating = altarray[x][:1]             HelpCount = helpcountarray[x]             AttractionCount = attractionarray[x]             Restaurant = restaurantarray[x]             Hotel = hotelarray[x]             Record = Organization + "," + Address + "," + Reviewer + "," + ReviewTitle + "," + Review + "," + "," + HelpCount + "," + AttractionCount + "," + Restaurant + "," + Hotel + ","  + RatingDate + ","             if Checker == "REVIEWS":                 file.write(bytes(Record, encoding="ascii", errors='ignore') + b"\n")         link = soup.find_all(attrs={"class": "nav next taLnk"})         print(Organization)         if len(link) == 0:             break         else:             soup = BeautifulSoup(urllib.request.urlopen('http://www.tripadvisor.com'                                                         + link[0].get('href')), 'html.parser')             print(link[0].get('href'))             Checker = link[0].get('href')[-7:] file.close()



Registruj se da bi učestvovao u diskusiji. Registrovanim korisnicima se NE prikazuju reklame unutar poruka.
Ko je trenutno na forumu
 

Ukupno su 608 korisnika na forumu :: 12 registrovanih, 1 sakriven i 595 gosta   ::   [ Administrator ] [ Supermoderator ] [ Moderator ] :: Detaljnije

Najviše korisnika na forumu ikad bilo je 1567 - dana 15 Jul 2016 19:18

Korisnici koji su trenutno na forumu:
Korisnici trenutno na forumu: Atomski čoban, bobanrakidjic, comi_pfc, GreenMan, Ivan Gajic, ivica976, jovan.simovic97, Kubovac, ltcolonel, Mugy, SERBIAN98, tmanda323