Notes
![]() ![]() Notes - notes.io |
/**
 * Recursive single-domain crawler that appends sitemap <url> entries to a
 * temporary file.
 *
 * Starting from a URL on $site, scan_url() fetches each page with cURL,
 * writes one <url> row per accepted HTML/PDF document to $file_stream and
 * then follows every <a href> / <frame src> link that stays on $real_site,
 * is not blacklisted and has not been seen before.
 *
 * This class only writes the <url> rows; $xmlheader, $file and $permissions
 * are stored for whoever finalises the sitemap file.
 */
class Sitemap_lib
{
    public $scanned, $deferredLinks, $file_stream, $freq, $priority, $enable_priority, $enable_frequency, $max_depth, $depth, $real_site, $indexed;
    public $debug;
    public $curl_validate_certificate, $curl_client, $index_pdf, $crawler_user_agent, $enable_modified;
    public $blacklist;
    public $ignore_arguments;
    public $site,$start,$tempfile,$file,$permissions,$index_img,$xmlheader,$modified;
    public $currentPage;

    /**
     * Set up the cURL handle, the temporary output file and the crawl defaults.
     *
     * Terminates the script (die) when the temporary file cannot be opened.
     */
    public function __construct()
    {
        // NOTE(review): config.php presumably defines the base_default_url
        // constant read below - confirm.
        include 'config.php';
        $this->curl_client = curl_init();
        $this->start = microtime(true);
        $this->tempfile = tempnam(sys_get_temp_dir(), 'sitemap.xml.');
        $this->file_stream = fopen($this->tempfile, "w") or die("Error: Could not create temporary file $this->tempfile" . "\n");
        $this->depth = 0;
        $this->indexed = 0;
        // Both maps are keyed by URL so membership checks are O(1).
        $this->scanned = array();
        $this->deferredLinks = array();
        $this->site = base_default_url;
        $this->real_site = $this->domain_root($this->site);
        $this->file = "sitemap.xml";
        $this->permissions = 0644;
        // 0 means "no depth limit" (see scan_url()).
        $this->max_depth = 0;
        $this->enable_frequency = TRUE;
        $this->enable_priority = TRUE;
        $this->freq = "daily";
        $this->priority = "1";
        $this->enable_modified = true;
        $this->curl_validate_certificate = true;
        // Shell-style wildcard patterns; a URL matching any of them is skipped.
        $this->blacklist = array("*.jpg", "*/secrets/*", "https://www.knyz.org/supersecret", "*/?replytocom=*", "*/www.*.com/*");
        $this->ignore_arguments = false;
        $this->index_img = false;
        $this->index_pdf = true;
        $this->crawler_user_agent = "Mozilla/5.0 (compatible; Sitemap Generator Crawler; +https://github.com/knyzorg/Sitemap-Generator-Crawler)";
        $this->xmlheader ='<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">';
        // Which logger() channels are echoed: added URLs, rejections, warnings.
        $this->debug = array("add" => true, "rej" => false, "war" => false);
    }

    /**
     * Forward reads of undeclared properties to the object returned by
     * get_instance(), which is defined outside this file.
     */
    public function __get($var)
    {
        return get_instance()->$var;
    }

    /**
     * Echo a log message when its channel is enabled in $debug.
     *
     * @param string $message Text to print.
     * @param int    $type    0 = added URL, 1 = rejection, 2 = warning/trace,
     *                        3 = error (always printed).
     */
    public function logger($message, $type)
    {
        switch ($type) {
            case 0:
                if ($this->debug["add"]) {
                    // The 1 KiB of padding follows each line; presumably it is
                    // there to push output past buffering - TODO confirm.
                    echo $message . "\r\n" . str_pad("", 1024, " ");
                }
                break;
            case 1:
                if ($this->debug["rej"]) {
                    echo $message . PHP_EOL;
                }
                break;
            case 2:
                if ($this->debug["war"]) {
                    echo $message . PHP_EOL;
                }
                break;
            case 3:
                echo "[!] $message\n";
                break;
        }
    }

    /**
     * Resolve "." and ".." segments in the path of a URL on $real_site.
     *
     * The query string (everything from the first "?") is carried over
     * untouched. URLs that do not start with $real_site are returned as-is.
     *
     * @param string $url Absolute URL.
     * @return string Normalised URL.
     */
    public function flatten_url($url)
    {
        $root_length = strlen((string) $this->real_site);
        if ($root_length === 0 || strncmp($url, $this->real_site, $root_length) !== 0) {
            return $url;
        }
        // Cut the prefix by length: splitting on the root would truncate URLs
        // that mention the root again later (e.g. in a redirect parameter).
        $path = (string) substr($url, $root_length);
        $query = '';
        $query_pos = strpos($path, '?');
        if ($query_pos !== false) {
            $query = substr($path, $query_pos);
            $path = (string) substr($path, 0, $query_pos);
        }
        return $this->real_site . $this->remove_dot_seg($path) . $query;
    }

    /**
     * Remove "." and ".." segments from a path.
     *
     * Paths without any dot are returned unchanged; otherwise the result has
     * its leading slashes stripped.
     *
     * @param string $path Path portion of a URL.
     * @return string Path with dot segments resolved.
     */
    public function remove_dot_seg($path)
    {
        if (strpos($path, '.') === false) {
            return $path;
        }
        $input = (string) $path;
        $output = array();
        // substr() results are cast because older PHP returns false, not "",
        // when the offset equals the string length.
        while ($input !== '') {
            if (strpos($input, "./") === 0) {
                $input = (string) substr($input, 2);
                continue;
            }
            if (strpos($input, "../") === 0) {
                $input = (string) substr($input, 3);
                continue;
            }
            if ($input === "/.") {
                $output[] = '/';
                break;
            }
            if (substr($input, 0, 3) === "/./") {
                $input = (string) substr($input, 2);
                continue;
            }
            if ($input === "/..") {
                array_pop($output);
                $output[] = '/';
                break;
            }
            if (substr($input, 0, 4) === "/../") {
                // ".." cancels the segment pushed just before it.
                array_pop($output);
                $input = (string) substr($input, 3);
                continue;
            }
            if ($input === '.' || $input === '..') {
                break;
            }
            // Move the next segment (up to, not including, the next slash).
            $slash_pos = strpos($input, '/', 1);
            if ($slash_pos === false) {
                $output[] = $input;
                break;
            }
            $output[] = substr($input, 0, $slash_pos);
            $input = (string) substr($input, $slash_pos);
        }
        return ltrim(implode($output), "/");
    }

    /**
     * Tell whether a URL was already scanned, treating "x" and "x/" as equal.
     *
     * @param string $url
     * @return bool
     */
    public function is_scanned($url)
    {
        if (isset($this->scanned[$url])) {
            return true;
        }
        // Try the other spelling: with the trailing slash toggled.
        $alternate = $this->ends_with($url, "/") ? substr($url, 0, -1) : $url . "/";
        return isset($this->scanned[$alternate]);
    }

    /**
     * @param string $haystack
     * @param string $needle
     * @return bool True when $haystack ends with $needle (always for "").
     */
    public function ends_with($haystack, $needle)
    {
        $length = strlen($needle);
        if ($length == 0) {
            return true;
        }
        return (substr($haystack, -$length) === $needle);
    }

    /**
     * Strip everything after the last "/" of a URL or path.
     *
     * @param string $path
     * @return string $path up to and including its last slash.
     */
    public function get_path($path)
    {
        $segments = explode("/", $path);
        $last_length = strlen($segments[count($segments) - 1]);
        return (substr($path, 0, strlen($path) - $last_length));
    }

    /**
     * Build "scheme://host/" from an absolute URL.
     *
     * @param string $href Absolute URL including its scheme.
     * @return string
     */
    public function domain_root($href)
    {
        // "https://host/a" splits into ["https:", "", "host", "a"].
        $url_parts = explode('/', $href);
        return $url_parts[0] . '//' . $url_parts[2] . '/';
    }

    /**
     * Fetch a URL and classify the response.
     *
     * When the response is a redirect, the target (without its query string)
     * is scanned via scan_url() unless it is blacklisted.
     *
     * @param string $url
     * @return array [0] response text (headers + body) for a 200 HTML
     *               response, the placeholder "This is a PDF" for PDFs when
     *               $index_pdf is on, otherwise false; [1] ISO-8601 remote
     *               modification time or null; [2] bool, image and $index_img.
     */
    public function get_data($url)
    {
        curl_setopt($this->curl_client, CURLOPT_URL, $url);
        // Return the response instead of printing it
        curl_setopt($this->curl_client, CURLOPT_RETURNTRANSFER, 1);
        // Get headers
        curl_setopt($this->curl_client, CURLOPT_HEADER, 1);
        // Optionally avoid validating SSL
        curl_setopt($this->curl_client, CURLOPT_SSL_VERIFYPEER, $this->curl_validate_certificate);
        // Set user agent
        curl_setopt($this->curl_client, CURLOPT_USERAGENT, $this->crawler_user_agent);
        // Must be requested before the transfer for CURLINFO_FILETIME to be filled.
        curl_setopt($this->curl_client, CURLOPT_FILETIME, (bool) $this->enable_modified);

        $data = curl_exec($this->curl_client);

        // Read every piece of metadata now: the redirect handling below
        // reuses the same handle for further requests.
        $content_type = (string) curl_getinfo($this->curl_client, CURLINFO_CONTENT_TYPE);
        $http_code = curl_getinfo($this->curl_client, CURLINFO_HTTP_CODE);
        $redirect_url = curl_getinfo($this->curl_client, CURLINFO_REDIRECT_URL);
        $modified = null;
        if ($this->enable_modified) {
            $timestamp = curl_getinfo($this->curl_client, CURLINFO_FILETIME);
            // -1 means the server did not report a modification time.
            if ($timestamp !== false && $timestamp != -1) {
                $modified = date('c', $timestamp);
            }
        }

        // If content acceptable, keep it. If not, `false`
        $html = ($http_code == 200 && stripos($content_type, "html") !== false) ? $data : false;
        if ($this->index_pdf && stripos($content_type, "application/pdf") !== false) {
            // PDFs carry no links to follow; a truthy placeholder lets them be indexed.
            $html = "This is a PDF";
        }
        // "image/..." starts at offset 0, so compare against false explicitly.
        $is_image = (stripos($content_type, "image/") !== false) && $this->index_img;

        // Scan new url, if redirect
        if ($redirect_url) {
            $this->logger("URL is a redirect.", 1);
            if (strpos($redirect_url, '?') !== false) {
                $redirect_url = explode('?', $redirect_url)[0];
            }
            if (!$this->check_blacklist($redirect_url)) {
                $this->logger("Redirected URL is in blacklist", 1);
            } else {
                $this->scan_url($redirect_url);
            }
        }

        // Assigned last so a nested scan cannot overwrite this page's value.
        $this->modified = $modified;
        return array($html, $modified, $is_image);
    }

    /**
     * Check a URL against the blacklist patterns.
     *
     * @param string $string URL to test.
     * @return bool False when the URL matches a blacklist pattern, true otherwise.
     */
    public function check_blacklist($string)
    {
        if (!is_array($this->blacklist)) {
            return true;
        }
        // Prefer the native matcher; fall back to the regex version below
        // where the platform does not provide fnmatch().
        $use_native = function_exists('fnmatch');
        foreach ($this->blacklist as $illegal) {
            $hit = $use_native ? fnmatch($illegal, $string) : $this->fnmatch($illegal, $string);
            if ($hit) {
                return false;
            }
        }
        return true;
    }

    /**
     * Extract, normalise and filter the links matched by $regexp in $html.
     *
     * @param string $html       Document text.
     * @param string $parent_url URL of the document, used for relative links.
     * @param string $regexp     Pattern body (no delimiters, "/" escaped)
     *                           whose second capture group is the link.
     * @return array One entry per match: the absolute URL, or false when the
     *               link was rejected.
     */
    public function get_links($html, $parent_url, $regexp)
    {
        if (!preg_match_all("/$regexp/siU", $html, $matches) || !$matches[2]) {
            $this->logger("Found nothing", 2);
            return array();
        }

        return array_map(function ($href) use ($parent_url) {
            $this->logger("Checking $href", 2);
            if (strpos($href, "#") !== false) {
                $this->logger("Dropping pound.", 2);
                $href = preg_replace('/#.*/', '', $href);
            }
            // Separate $href from $query_string
            $query_string = '';
            if (strpos($href, '?') !== false) {
                list($href, $query_string) = explode('?', $href, 2);
                // Decode HTML-escaped ampersands so cURL gets a real query string
                $query_string = str_replace('&amp;', '&', $query_string);
            }
            if ($this->ignore_arguments) {
                $query_string = '';
            }
            $suffix = $query_string ? '?' . $query_string : '';

            // Protocol-relative link: borrow the scheme of the crawled site.
            if (substr($href, 0, 2) === '//') {
                $href = explode('/', $this->real_site)[0] . $href;
            }

            if ((substr($href, 0, 7) != "http://") && (substr($href, 0, 8) != "https://")) {
                // Link does not call (potentially) external page
                if (strpos($href, ":") !== false) {
                    $this->logger("URL is an invalid protocol", 1);
                    return false;
                }
                if ($href == '/') {
                    $this->logger("$href is domain root", 2);
                    $href = $this->real_site;
                } elseif (substr($href, 0, 1) == '/') {
                    $this->logger("$href is relative to root, convert to absolute", 2);
                    $href = $this->domain_root($this->real_site) . substr($href, 1);
                } else {
                    $this->logger("$href is relative, convert to absolute", 2);
                    $href = $this->get_path($parent_url) . $href;
                }
            }
            $this->logger("Result: $href", 2);
            if (!filter_var($href, FILTER_VALIDATE_URL)) {
                $this->logger("URL is not valid. Rejecting.", 1);
                return false;
            }
            if (substr($href, 0, strlen($this->real_site)) != $this->real_site) {
                $this->logger("URL is not part of the target domain. Rejecting.", 1);
                return false;
            }
            if ($this->is_scanned($href . $suffix)) {
                $this->logger("URL has already been scanned. Rejecting.", 1);
                return false;
            }
            // Test with and without the query string so that both path rules
            // ("*.jpg") and query rules ("*/?replytocom=*") can match.
            if (!$this->check_blacklist($href) || ($suffix !== '' && !$this->check_blacklist($href . $suffix))) {
                $this->logger("URL is blacklisted. Rejecting.", 1);
                return false;
            }
            return $this->flatten_url($href . $suffix);
        }, $matches[2]);
    }

    /**
     * Crawl one URL: fetch it, append its sitemap row and recurse into its links.
     *
     * The URL is skipped when it was already scanned, lies outside
     * $real_site, exceeds $max_depth, or is not an indexable document.
     *
     * @param string $url Absolute URL to scan.
     */
    public function scan_url($url)
    {
        $this->depth++;
        $this->logger("Scanning $url", 2);
        if ($this->is_scanned($url)) {
            $this->logger("URL has already been scanned. Rejecting. ", 1);
            $this->depth--;
            return;
        }
        if (substr($url, 0, strlen($this->real_site)) != $this->real_site) {
            $this->logger("URL is not part of the target domain. Rejecting.", 1);
            $this->depth--;
            return;
        }
        if (!($this->depth <= $this->max_depth || $this->max_depth == 0)) {
            $this->logger("Maximum depth exceeded. Rejecting.", 1);
            $this->depth--;
            return;
        }
        // Note that URL has been scanned
        $this->scanned[$url] = 1;
        // Send cURL request
        // NOTE(review): the image flag is not acted upon; image responses are
        // rejected below because they carry no HTML.
        list($html, $this->modified, $is_image) = $this->get_data($url);
        if (!$html) {
            $this->logger("Invalid Document. Rejecting.", 1);
            $this->depth--;
            return;
        }

        // XML-escape the location; entities already present are left alone.
        $loc = htmlspecialchars($url, ENT_QUOTES | ENT_XML1, 'UTF-8', false);
        $map_row = "<url>\n";
        $map_row .= "<loc>$loc</loc>\n";
        if ($this->enable_frequency) {
            $map_row .= "<changefreq>$this->freq</changefreq>\n";
        }
        if ($this->enable_priority) {
            $map_row .= "<priority>$this->priority</priority>\n";
        }
        // \T keeps the literal "T" separator of the W3C datetime format.
        $map_row .= " <lastmod>" . date('Y-m-d\TH:i:sP', time()) . "</lastmod>\n";
        $map_row .= "</url>\n";
        fwrite($this->file_stream, $map_row);
        $this->indexed++;
        $this->logger($url, 0);
        unset($is_image, $map_row, $loc);

        // Extract urls from <a href="??"></a>
        $ahrefs = $this->get_links($html, $url, '<a\s[^>]*href=("|\'??)([^" >]*?)\1[^>]*>(.*)<\/a>');
        // Extract urls from <frame src="??">
        $framesrc = $this->get_links($html, $url, '<frame\s[^>]*src=("|\'??)([^" >]*?)\1[^>]*>');

        // Drop rejected (false) entries, duplicates and links already queued
        // by a page higher up the recursion.
        $deferred = $this->deferredLinks;
        $links = array_unique(array_filter(array_merge($ahrefs, $framesrc), function ($item) use ($deferred) {
            return $item && !isset($deferred[$item]);
        }));
        unset($html, $url, $ahrefs, $framesrc, $deferred);
        $this->logger("Found urls: " . join(", ", $links), 2);

        // Mark every link as deferred first so nested scans do not queue it again
        foreach ($links as $href) {
            $this->deferredLinks[$href] = 1;
        }
        foreach ($links as $href) {
            $this->scan_url($href);
        }
        $this->depth--;
    }

    /**
     * Regex-based stand-in for the native fnmatch(): "*" matches any run of
     * characters, "?" matches one character, case-insensitively.
     *
     * @param string $pattern Shell-style wildcard pattern.
     * @param string $string  Text to test.
     * @return int|false 1 on match, 0 on no match, false on regex error.
     */
    public function fnmatch($pattern, $string)
    {
        // preg_quote() escapes the wildcards, so the escaped forms are what
        // must be translated back into regex syntax.
        return preg_match("#^" . strtr(preg_quote($pattern, '#'), array('\*' => '.*', '\?' => '.')) . "$#i", $string);
    }
}
![]() |
Notes is a web-based application for taking notes online. You can take notes and share them with other people. If you like taking long notes, notes.io is designed for you. To date, over 8,000,000,000 notes have been created, and counting...
With notes.io;
- You can take a note from anywhere and on any device with an internet connection.
- You can share your notes on social platforms (YouTube, Facebook, Twitter, Instagram, etc.).
- You can quickly share your content without a website, blog, or e-mail.
- You don't need to create an account to share a note. You can send quick, easy, shortened note links via SMS, websites, e-mail, or messaging services (WhatsApp, iMessage, Telegram, Signal).
- Notes.io has a fabulous infrastructure design for short links and lets you share a note as an easy and understandable link.
Fast: Notes.io is built for speed and performance. You can take notes quickly and browse your archive.
Easy: Notes.io doesn’t require installation. Just write and share your note!
Short: A Notes.io URL is just 8 characters long. You’ll get a shortened link to your note when you want to share it. (Ex: notes.io/q )
Free: Notes.io has been running for 14 years and has been free since the day it started.
You can create your first note right away and start sharing it with anyone you wish. If you want to contact us, you can use the following communication channels:
Email: [email protected]
Twitter: http://twitter.com/notesio
Instagram: http://instagram.com/notes.io
Facebook: http://facebook.com/notesio
Regards;
Notes.io Team