Bugzilla – Attachment 170457 Details for
Bug 137470
Fix temporary URL in additionsdialog
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Help
|
New Account
|
Log In
[x]
|
Forgot Password
Login:
[x]
Scraper script for the current daily-updated pseudo-API
scraper.py (text/x-python), 8.87 KB, created by
Muhammet Kara
on 2021-03-13 17:00:19 UTC
(
hide
)
Description:
Scraper script for the current daily-updated pseudo-API
Filename:
MIME Type:
Creator:
Muhammet Kara
Created:
2021-03-13 17:00:19 UTC
Size:
8.87 KB
patch
obsolete
"""Scrape extensions.libreoffice.org into static JSON files (a pseudo-API).

For every tag listed on the site's front page, the script walks the paginated
listing, visits each extension page, and writes one ``<tag name>.json`` file
plus a combined ``allextensions.json`` into ``API_DIR``.
"""

import requests
from bs4 import BeautifulSoup
import re
import json
import random


webpageURL = "https://extensions.libreoffice.org"

# Directory the JSON files are written to (served by nginx).
API_DIR = "/usr/share/nginx/html/api/"

# Number of extensions requested per listing page (the "start" offset step).
PAGE_SIZE = 30

# Compiled once; applied to every piece of free text taken from the site.
_NON_ASCII_RUN = re.compile(r'[^\x00-\x7F]+')
_ESCAPED_UNICODE = re.compile(r'(?:\\u)[^\s]+')

# NOTE(review): the star glyphs were mis-encoded in the original script; they
# are reconstructed here as BLACK STAR / WHITE STAR -- confirm against the
# markup of a live extension page.
_FILLED_STAR = "\u2605"
_EMPTY_STAR = "\u2606"


def replace_trash(unicode_string):
    """Return *unicode_string* with every non-ASCII character replaced by a space.

    The previous implementation raised ``TypeError`` on the first non-ASCII
    character (``str.replace`` called with a single argument) and rebound the
    whole string to one character.
    """
    return "".join(ch if ord(ch) < 128 else " " for ch in unicode_string)


def _ascii_only(text):
    """Collapse non-ASCII runs and literal ``\\u...`` escape tokens to one space each."""
    return _ESCAPED_UNICODE.sub(' ', _NON_ASCII_RUN.sub(' ', text))


class Release:
    """One downloadable release of an extension."""

    def __init__(self, releaseName="NULL", releaseDescription="NULL",
                 compatibility="NULL", os="NULL", license="NULL",
                 notes="NULL", downloadURL="NULL"):
        self.releaseName = releaseName
        self.releaseDescription = releaseDescription
        self.compatibility = compatibility
        self.os = os
        self.license = license
        self.notes = notes
        self.downloadURL = downloadURL

    def toJSON(self):
        """Serialize this object (recursively, via ``__dict__``) to a JSON string."""
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)


class Extension:
    """An extension entry; two extensions are equal when their names match."""

    def __init__(self, id="NULL", name="NULL", author="NULL", url="NULL",
                 screenshotURL="NULL", downloadNumber="NULL",
                 commentNumber="NULL", commentURL="NULL",
                 extensionIntroduction="NULL", extensionDescription="NULL",
                 tags=None, rating="NULL", releases=None):
        self.id = id
        self.name = name
        self.author = author
        self.url = url
        self.screenshotURL = screenshotURL
        self.downloadNumber = downloadNumber
        self.commentNumber = commentNumber
        self.commentURL = commentURL
        self.extensionIntroduction = extensionIntroduction
        self.extensionDescription = extensionDescription
        # Fresh list per instance: a literal [] default would be shared by
        # every Extension created without an explicit argument.
        self.tags = [] if tags is None else tags
        self.rating = rating
        self.releases = [] if releases is None else releases

    def toJSON(self):
        """Serialize this object (recursively, via ``__dict__``) to a JSON string."""
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)

    def __eq__(self, obj):
        return isinstance(obj, Extension) and obj.name == self.name

    def __hash__(self):
        # Kept consistent with __eq__, which compares by name only.
        return hash(self.name)


class Result:
    """Top-level JSON document: an item count plus a list of extensions."""

    def __init__(self, extensionCount=-1, extension=None):
        self.extensionCount = extensionCount
        self.extension = extension

    def toJSON(self):
        """Serialize this object (recursively, via ``__dict__``) to a JSON string."""
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)


class Tag:
    """A tag from the front page: its data-id, display name and item count."""

    def __init__(self, id=-1, name="NULL", itemCount=-1):
        self.id = id
        self.name = name
        self.itemCount = itemCount


tagsArray = []


def loadTagsToArray():
    """Fetch the front page and append one ``Tag`` per tag span to ``tagsArray``.

    The span text is split at its first "(" and first ")": the part before
    the parenthesis (minus one separator character) becomes the name, the
    part between the parentheses the item count (kept as a string).
    """
    response = requests.get(webpageURL)
    page = BeautifulSoup(response.content, "lxml")
    for span in page.find_all("span", attrs={"class": "tag"}):
        tag_text = span.text
        open_paren = tag_text.find('(')
        close_paren = tag_text.find(')')
        if open_paren == -1 or close_paren == -1:
            # No "(count)" part: slicing with -1 would yield a garbage name
            # and a count that int() cannot parse later on.
            continue
        tag_name = tag_text[0:open_paren - 1]
        item_count = tag_text[open_paren + 1:close_paren]
        tagsArray.append(Tag(span['data-id'], tag_name, item_count))


def _parse_rating(rating_text):
    """Map a five-glyph star string to "0".."5"; anything else gives "NaN"."""
    glyphs = rating_text.split()
    if len(glyphs) == 5 and all(g in (_FILLED_STAR, _EMPTY_STAR) for g in glyphs):
        return str(glyphs.count(_FILLED_STAR))
    return "NaN"


def _parse_releases(release_list):
    """Build a ``Release`` for every release row inside *release_list*."""
    releases = []
    for row in release_list.find_all("li", attrs={"class": "releaseRow"}):
        spans = row.find_all("span")
        if len(spans) < 7:
            # The seven positional fields below are required; skip rows that
            # do not have that layout instead of raising IndexError.
            continue
        link = spans[6].find("a")
        download_url = webpageURL + link['href'] if link is not None else "NULL"
        releases.append(Release(
            _ascii_only(spans[0].text),   # version
            _ascii_only(spans[1].text),   # description
            spans[2].text,                # compatibility
            spans[3].text,                # operating system
            spans[4].text,                # license
            _ascii_only(spans[5].text),   # release notes
            download_url,
        ))
    return releases


def _scrape_extension(box):
    """Build an ``Extension`` from one listing box, or None if it has no releases."""
    heading = box.find("h3")
    extension_name = _ascii_only(heading.text)
    extension_url = webpageURL + heading.find("a")['href']

    first_div = box.find("div")
    thumb_link = first_div.find("a") if first_div is not None else None
    thumb_img = thumb_link.find("img") if thumb_link is not None else None
    screenshot_url = webpageURL + thumb_img['src'] if thumb_img is not None else "NULL"

    # Placeholder value: the script generates a random download count rather
    # than reading one from the page.
    download_number = str(random.randint(1, 30000))

    detail = BeautifulSoup(requests.get(extension_url).content, "lxml")

    holder = detail.find("div", attrs={"id": "comments-holder"})
    inner = holder.find("div", attrs={"class": "comments-holder"}) if holder is not None else None
    comment_list = (inner.find("ul", attrs={"class": "comments-list root-level"})
                    if inner is not None else None)
    comment_number = str(len(comment_list.find_all("li"))) if comment_list is not None else "0"
    comment_url = extension_url + "#comments-holder"

    intro_node = detail.find("p", attrs={"class": "intro"})
    extension_intro = _ascii_only(intro_node.text if intro_node is not None else "NULL")

    content = detail.find("div", attrs={"class": "content-container unit size3of4 lastUnit"})
    tag_spans = content.find_all("span", attrs={"class": "tag"}) if content is not None else []
    extension_tags = [span.text for span in tag_spans]

    rating_node = detail.find("p", attrs={"class": "coments-ratings"})
    extension_rating = _parse_rating(rating_node.text) if rating_node is not None else "NaN"

    release_list = detail.find("div", attrs={"class": "releaseList"})
    if release_list is None:
        return None  # There is no release

    # The id is whatever follows the last "/" of the extension URL.
    extension_id = extension_url.split("/")[-1]
    return Extension(extension_id, extension_name, "NULL", extension_url,
                     screenshot_url, download_number, comment_number,
                     comment_url, extension_intro, "NULL", extension_tags,
                     extension_rating, _parse_releases(release_list))


loadTagsToArray()

num = 0                 # extensions processed across all tags
totalArray = []         # every extension once, de-duplicated by name
totalItemNumber = 0
_seen_names = set()     # names already in totalArray (mirrors Extension.__eq__)

for tag in tagsArray:
    jsonArray = []
    print("Tag: " + tag.name)

    item_count = int(tag.itemCount)
    pageCount = -(-item_count // PAGE_SIZE)  # ceiling division

    for page_index in range(pageCount):
        # The tag's id selects the listing; "start" paginates in PAGE_SIZE steps.
        currentURL = (webpageURL + "/?Tags%5B%5D=" + tag.id
                      + "&start=" + str(page_index * PAGE_SIZE))
        listing = BeautifulSoup(requests.get(currentURL).content, "lxml")
        print("Page: " + str(page_index + 1))

        for box in listing.find_all("div", attrs={"class": "box"}):
            extension = _scrape_extension(box)
            if extension is None:
                continue
            jsonArray.append(extension)
            if extension.name not in _seen_names:
                _seen_names.add(extension.name)
                totalArray.append(extension)
            num += 1

    totalItemNumber += item_count

    # Opened only after scraping succeeded, so a failure mid-scrape does not
    # leave a truncated file behind; the context manager closes the handle.
    # NOTE(review): tag.name comes from the remote page and is used verbatim
    # in the path -- consider sanitising it.
    with open(API_DIR + tag.name + ".json", "w") as f:
        f.write(Result(tag.itemCount, jsonArray).toJSON())

with open(API_DIR + "allextensions.json", "w") as totalf:
    totalf.write(Result(totalItemNumber, totalArray).toJSON())
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Raw
Actions:
View
Attachments on
bug 137470
: 170457