Как парсить страницы сайтов с автоподгрузкой на примере instagram

🚀 Usage

  • BigQuery
  • DB2
  • Hive
  • MariaDB
  • Mysql
  • PostgresQL
  • TransactSQL
  • New issue could be made for other new database.
const { Parser } = require('node-sql-parser');
const parser = new Parser();
const ast = parser.astify('SELECT * FROM t');
console.log(ast);

ast for SELECT * FROM t

{"with":null,"type":"select","options":null,"distinct":null,"columns":"*","from":[{"db":null,"table":"t","as":null}],"where":null,"groupby":null,"having":null,"orderby":null,"limit":null}
const opt = { database: 'MySQL' };
const { Parser } = require('node-sql-parser/build/mysql');
const parser = new Parser();
const ast = parser.astify('SELECT * FROM t', opt);
const sql = parser.sqlify(ast, opt);
console.log(sql);
const opt = { database: 'MariaDB' };
const { Parser } = require('node-sql-parser/build/mariadb');
const parser = new Parser();
const { tableList, columnList, ast } = parser.parse('SELECT * FROM t', opt);
  • get the table list that the sql visited
  • the format is {type}::{dbName}::{tableName} // type could be select, update, delete or insert
const opt = { database: 'MySQL' };
const { Parser } = require('node-sql-parser/build/mysql');
const parser = new Parser();
const tableList = parser.tableList('SELECT * FROM t', opt);
console.log(tableList);
  • get the column list that the sql visited
  • the format is {type}::{tableName}::{columnName} // type could be select, update, delete or insert
  • for , and without specified columns, the column authority regex is required
const opt = { database: 'MySQL' };
const { Parser } = require('node-sql-parser/build/mysql');
const parser = new Parser();
const columnList = parser.columnList('SELECT t.id FROM t', opt);
console.log(columnList);
  • check table authority
  • the `whiteListCheck` function checks in `table` mode against the `MySQL` database by default
const { Parser } = require('node-sql-parser');
const parser = new Parser();
const sql = 'UPDATE a SET id = 1 WHERE name IN (SELECT name FROM b)';
const whiteTableList = ['(select|update)::(.*)::(a|b)'];
const opt = {
  database: 'MySQL',
  type: 'table',
};
parser.whiteListCheck(sql, whiteTableList, opt);

check column authority


const { Parser } = require('node-sql-parser');
const parser = new Parser();
const sql = 'UPDATE a SET id = 1 WHERE name IN (SELECT name FROM b)';
const whiteColumnList = ['select::null::name', 'update::a::id'];
const opt = {
  database: 'MySQL',
  type: 'column',
};
parser.whiteListCheck(sql, whiteColumnList, opt);

🎉 Install

npm install node-sql-parser --save
or
yarn add node-sql-parser
npm install @taozhi8833998/node-sql-parser --registry=https://npm.pkg.github.com/

Import the JS file in your page:

<script src="https://unpkg.com/node-sql-parser/umd/index.umd.js"></script>
<script src="https://unpkg.com/node-sql-parser/umd/mysql.umd.js"></script>
<script src="https://unpkg.com/node-sql-parser/umd/postgresql.umd.js"></script>

NodeSQLParser object is on window

<!DOCTYPE html>
<html lang="en">
  <head>
    <title>node-sql-parser</title>
    <meta charset="utf-8" />
  </head>
  <body>
    <p><em>Check console to see the output</em></p>
    <script src="https://unpkg.com/node-sql-parser/umd/mysql.umd.js"></script>
    <script>
      window.onload = function () {
        const parser = new NodeSQLParser.Parser()
        const ast = parser.astify("select id, name from students where age < 18")
        console.log(ast)
        const sql = parser.sqlify(ast)
        console.log(sql)
      }
    </script>
  </body>
</html>

DefaultHandler Options

var handler = new htmlparser.DefaultHandler(function (error) { ... }, { verbose: false, ignoreWhitespace: true });

Indicates whether the DOM should exclude text nodes that consists solely of whitespace. The default value is «false».

The following HTML:

<font><br>this is the text<font>

becomes:

[ { raw: 'font', data: 'font', type: 'tag', name: 'font', children: [ { raw: 'br', data: 'br', type: 'tag', name: 'br' }, { raw: 'this is the text\n', data: 'this is the text\n', type: 'text' }, { raw: 'font', data: 'font', type: 'tag', name: 'font' } ] } ]

The following HTML:

<font><br>this is the text<font>

becomes:

[ { raw: 'font', data: 'font', type: 'tag', name: 'font', children: [ { raw: '\n\t', data: '\n\t', type: 'text' }, { raw: 'br', data: 'br', type: 'tag', name: 'br' }, { raw: 'this is the text\n', data: 'this is the text\n', type: 'text' }, { raw: 'font', data: 'font', type: 'tag', name: 'font' } ] } ]

Indicates whether to include extra information on each node in the DOM. This information consists of the «raw» attribute (original, unparsed text found between «<» and «>») and the «data» attribute on «tag», «script», and «comment» nodes. The default value is «true».

The following HTML:

<a href="test.html">xxx</a>

becomes:

[ { raw: 'a href="test.html"', data: 'a href="test.html"', type: 'tag', name: 'a', attribs: { href: 'test.html' }, children: [ { raw: 'xxx', data: 'xxx', type: 'text' } ] } ]

The following HTML:

<a href="test.html">xxx</a>

becomes:

[ { type: 'tag', name: 'a', attribs: { href: 'test.html' }, children: [ { data: 'xxx', type: 'text' } ] } ]

Indicates whether the DOM should prevent children on tags marked as empty in the HTML spec. Typically this should be set to «true» for HTML parsing and «false» for XML parsing. The default value is «true».

The following HTML:

<link>text</link>

becomes:

[ { raw: 'link', data: 'link', type: 'tag', name: 'link' }, { raw: 'text', data: 'text', type: 'text' } ]

The following HTML:

<link>text</link>

becomes:

[ { raw: 'link', data: 'link', type: 'tag', name: 'link', children: [ { raw: 'text', data: 'text', type: 'text' } ] } ]

Examples

This example demonstrates adding a generic JSON and URL-encoded parser as a top-level middleware, which will parse the bodies of all incoming requests. This is the simplest setup.

var express = require('express')
var bodyParser = require('body-parser')

var app = express()

app.use(bodyParser.urlencoded({ extended: false }))
app.use(bodyParser.json())

app.use(function (req, res) {
  res.setHeader('Content-Type', 'text/plain')
  res.write('you posted:\n')
  res.end(JSON.stringify(req.body, null, 2))
})

This example demonstrates adding body parsers specifically to the routes that need them. In general, this is the most recommended way to use body-parser with Express.

var express = require('express')
var bodyParser = require('body-parser')

var app = express()

var jsonParser = bodyParser.json()
var urlencodedParser = bodyParser.urlencoded({ extended: false })

app.post('/login', urlencodedParser, function (req, res) {
  res.send('welcome, ' + req.body.username)
})

app.post('/api/users', jsonParser, function (req, res) {})

All the parsers accept a `type` option which allows you to change the `Content-Type` that the middleware will parse.

var express = require('express')
var bodyParser = require('body-parser')

var app = express()

app.use(bodyParser.json({ type: 'application/*+json' }))
app.use(bodyParser.raw({ type: 'application/vnd.custom-type' }))
app.use(bodyParser.text({ type: 'text/html' }))

License

(The MIT License)


Copyright (c) 2016 xml2json AUTHORS

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the «Software»), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED «AS IS», WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Parser

Parsing is very simple.

Just call the `parse` method of the xml-parse instance.

const xml = require("xml-parse");

var parsedXML = xml.parse('<?xml version="1.0" encoding="UTF-8"?>' +
                          '<root>Root Element</root>');
console.log(parsedXML);

var parsedInavlidXML = xml.parse('<root></root>' +
                                 '<secondRoot>' +
                                 '<notClosedTag>' +
                                 '</secondRoot>');
console.log(parsedInavlidXML);

The result of `parse` is an array of objects that may look like this:

(In this case we have the xml string of the given example)

[
  {
    type: 'element',
    tagName: '?xml',
    attributes: {
      version: '1.0',
      encoding: 'UTF-8'
    },
    childNodes: [],
    innerXML: '>',
    closing: false,
    closingChar: '?'
  },
  {
    type: 'element',
    tagName: 'root',
    attributes: {},
    childNodes: [
      {
        type: 'text',
        text: 'Root Element'
      }
    ],
    innerXML: 'Root Element',
    closing: true,
    closingChar: null
  }
]

The root object is always an array because of the fact that it handles invalid xml with more than one root element.

There are two kinds of objects: `element` and `text`. An object always has the property `type`. The other keys depend on this type.

{
  type: String,
  tagName: String,
  attributes: Object,
  childNodes: Array,
  innerXML: String,
  closing: Boolean,
  closingChar: String || null
}
{
  type: String,
  text: String
}

Processing attribute, tag names and values

You can optionally provide the parser with attribute name and tag name processors as well as element value processors:

function nameToUpperCase(name) {
  return name.toUpperCase();
}

parseString(xml, {
  tagNameProcessors: [nameToUpperCase],
  attrNameProcessors: [nameToUpperCase],
  valueProcessors: [nameToUpperCase],
  attrValueProcessors: [nameToUpperCase]
}, (err, result) => {});

The `tagNameProcessors`, `attrNameProcessors`, `attrValueProcessors` and `valueProcessors` options accept an `Array` of functions with the following signature:

function (name) {
  // do something with `name`
  return name;
}

Some processors are provided out-of-the-box and can be found in code:

  • : transforms the name to lowercase. (Automatically used when option is set to )

  • : transforms the first character to lower case. E.g. ‘MyTagName’ becomes ‘myTagName’

  • : strips the xml namespace prefix. E.g will become ‘Bar’. (N.B.: the prefix is NOT stripped.)

  • : parses integer-like strings as integers and float-like strings as floats E.g. «0» becomes 0 and «15.56» becomes 15.56

  • : parses boolean-like strings to booleans E.g. «true» becomes true and «False» becomes false

API

Returns:

Type:

As an alternative to passing an object, you may pass an which specifies the headers to use. For example:

csv(['Name', 'Age']);

If you need to specify options and headers, please use the object notation with the `headers` property as shown below.

Type: Default:

A single-character string used to specify the character used to escape strings in a CSV row.

Type:

Specifies the headers to use. Headers define the property key for each value in a CSV row. If no option is provided, will use the first line in a CSV file as the header specification.

If , specifies that the first row in a data file does not contain headers, and instructs the parser to use the column index as the key for each column. Using with the same example from above would yield:

[
  { '0': 'Daffy Duck', '1': 24 },
  { '0': 'Bugs Bunny', '1': 22 }
]

Note: If using the for an operation on a file which contains headers on the first line, specify to skip over the row, or the headers row will appear as normal row data. Alternatively, use the option to manipulate existing headers in that scenario.

Type:

A function that can be used to modify the values of each header. Return a `String` to modify the header. Return `null` to remove the header, and its column, from the results.

csv({
  mapHeaders: ({ header, index }) => header.toLowerCase()
})

  • `header` (String): The current column header.
  • `index` (Number): The current column index.

Type:

A function that can be used to modify the content of each column. The return value will replace the current column content.

csv({
  mapValues: ({ header, index, value }) => value.toLowerCase()
})

  • `header` (String): The current column header.
  • `index` (Number): The current column index.
  • `value` (String): The current column value (or content).

Type: Default:

Specifies a single-character string to denote the end of a line in a CSV file.

Type: Default:


Specifies a single-character string to denote a quoted string.

Type:

If , instructs the parser not to decode UTF-8 strings.

Type: Default:

Specifies a single-character string to use as the column separator for each row.

Type: Default:

Instructs the parser to ignore lines which represent comments in a CSV file. Since there is no specification that dictates what a CSV comment looks like, comments should be considered non-standard. The «most common» character used to signify a comment in a CSV file is . If this option is set to , lines which begin with will be skipped. If a custom character is needed to denote a commented line, this option may be set to a string which represents the leading character(s) signifying a comment line.

Type: Default:

Specifies the number of lines at the beginning of a data file that the parser should skip over, prior to parsing headers.

Type: Default:

Maximum number of bytes per row. An error is thrown if a line exceeds this value. The default value is 8 petabytes.

Type:

If , instructs the parser that the number of columns in each row must match the number of specified.

Usage

You can parse RSS from a URL (`parser.parseURL`) or an XML string (`parser.parseString`).

Both callbacks and Promises are supported.

Here’s an example in NodeJS using Promises with async/await:

let Parser = require('rss-parser');
let parser = new Parser();

(async () => {
  let feed = await parser.parseURL('https://www.reddit.com/.rss');
  console.log(feed.title);

  feed.items.forEach(item => {
    console.log(item.title + ':' + item.link)
  });
})();

Here’s an example in the browser using callbacks:

<script src="/node_modules/rss-parser/dist/rss-parser.min.js"></script>
<script>
const CORS_PROXY = "https://cors-anywhere.herokuapp.com/"

let parser = new RSSParser();
parser.parseURL(CORS_PROXY + 'https://www.reddit.com/.rss', function(err, feed) {
  if (err) throw err;
  console.log(feed.title);
  feed.items.forEach(function(entry) {
    console.log(entry.title + ':' + entry.link);
  })
})
</script>

A few minor breaking changes were made in v3. Here’s what you need to know:

  • You need to construct a before calling or
  • is no longer available (for better browser support)
  • are now passed to the Parser constructor
  • is now just (top-level object removed)
  • is now (to better match RSS XML)

API

parser.toJson(xml, options);
parser.toXml(json);

Default values:

var options = {
    object: false,
    reversible: false,
    coerce: false,
    sanitize: true,
    trim: true,
    arrayNotation: false,
    alternateTextNode: false
};
  • object: Returns a Javascript object instead of a JSON string
  • reversible: Makes the JSON reversible to XML (*)
  • coerce: Makes type coercion. i.e.: numbers and booleans present in attributes and element values are converted from string to its correspondent data types. Coerce can be optionally defined as an object with specific methods of coercion based on attribute name or tag name, with fallback to default coercion.
  • trim: Removes leading and trailing whitespaces as well as line terminators in element values.
  • arrayNotation: XML child nodes are always treated as arrays NB: you can specify a selective array of nodes for this to apply to instead of the whole document.
  • sanitize: Sanitizes the following characters present in element values:
var chars = {
    '<': '&lt;',
    '>': '&gt;',
    '(': '&#40;',
    ')': '&#41;',
    '#': '&#35;',
    '&': '&amp;',
    '"': '&quot;',
    "'": '&apos;'
};

alternateTextNode: Changes the default textNode property from $t to _t when option is set to true. Alternatively a string can be specified which will override $t to what ever the string is.

Default values:

var options = {
    sanitize: false,
    ignoreNull: false
};
  • is the default option to behave like previous versions
  • ignoreNull: Ignores all null values

(*) xml2json transforms CDATA content to JSON, but it doesn’t generate a reversible structure.

Using jQuery/Zepto ($.ua)

Although written in vanilla js (which means it doesn’t depend on jQuery), this library will automatically detect if jQuery/Zepto is present and create a `$.ua` object based on the browser’s user-agent (although, in case you need it, the `UAParser` constructor is still present). To get/set the user-agent you can use: `$.ua.get()` / `$.ua.set(uastring)`.

console.log($.ua.device);
console.log($.ua.os);
console.log($.ua.os.name);
console.log($.ua.get());

$.ua.set('Mozilla/5.0 (Linux; U; Android 3.0.1; en-us; Xoom Build/HWI69) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13');

console.log($.ua.browser.name);
console.log($.ua.engine.name);
console.log($.ua.device);
console.log(parseInt($.ua.browser.version.split('.')[0], 10));

$('body').addClass('ua-browser-' + $.ua.browser.name + ' ua-devicetype-' + $.ua.device.type);

Shoot-and-forget usage

You want to parse XML as simple and easy as possible? It’s dangerous to go alone, take this:

var parseString = require('xml2js').parseString;
var xml = "<root>Hello xml2js!</root>"
parseString(xml, function (err, result) {
    console.dir(result);
});

Can’t get easier than this, right? This works starting with 0.2.3. With CoffeeScript it looks like this:

{parseString} = require 'xml2js'
xml = "<root>Hello xml2js!</root>"
parseString xml, (err, result) ->
    console.dir result

If you need some special options, fear not, `xml2js` supports a number of options (see below), you can specify these as second argument:

parseString(xml, {trim: true}, function (err, result) {});

Ссылки […]()

Ссылки можно задавать вместо вот так:

(http://wikipedia.org)

Для показа ссылки без особого заголовка:

[http://wikipedia.org]()
-- станет
<a href="http://wikipedia.org">http://wikipedia.org</a>

Для ссылки на статью или задачу с сайта можно использовать только её абсолютный URL, заголовок подставится автоматически, например:

Читайте об этом в главе [](/events)
-- станет (из базы будет получен заголовок)
Читайте об этом в главе <a href="/events">События</a> 

Для того, чтобы сослаться на заголовок, у которого есть :

(#instanceof)
-- станет (если есть статья с заголовком )
<a href="/url-этой-статьи#instanceof">Оператор <code>instanceof</code></a>

Output

feedUrl: 'https://www.reddit.com/.rss'
title: 'reddit: the front page of the internet'
description: ""
link: 'https://www.reddit.com/'
items:
    - title: 'The water is too deep, so he improvises'
      link: 'https://www.reddit.com/r/funny/comments/3skxqc/the_water_is_too_deep_so_he_improvises/'
      pubDate: 'Thu, 12 Nov 2015 21:16:39 +0000'
      creator: "John Doe"
      content: '<a href="http://example.com">this is a link</a> &amp; <b>this is bold text</b>'
      contentSnippet: 'this is a link & this is bold text'
      guid: 'https://www.reddit.com/r/funny/comments/3skxqc/the_water_is_too_deep_so_he_improvises/'
      categories:
          - funny
      isoDate: '2015-11-12T21:16:39.000Z'
  • The field strips out HTML tags and unescapes HTML entities
  • The prefix will be removed from all fields
  • Both and will be available in ISO 8601 format as
  • Atom’s becomes for consistency

Usage

All examples assume that this library is bootstrapped using:

'use strict';

var Url = require('url-parse');

To parse an URL simply call the method with the URL that needs to be transformed into an object.

var url = new Url('https://github.com/foo/bar');

The keyword is optional but it will save you an extra function invocation. The constructor takes the following arguments:

  • (): A string representing an absolute or relative URL.
  • ( | ): This argument is optional and specifies how to parse the query string. By default it is so the query string is not parsed. If you pass the query string is parsed using the embedded module. If you pass a function the query string will be parsed using this function.

As said above we also support the Node.js interface so you can also use the library in this way:

'use strict';

var parse = require('url-parse')
  , url = parse('https://github.com/foo/bar', true);

The returned instance contains the following properties:

  • `protocol`: The protocol scheme of the URL (e.g. `http:`).
  • `slashes`: A boolean which indicates whether the `protocol` is followed by two forward slashes (`//`).
  • `auth`: Authentication information portion (e.g. `username:password`).
  • `username`: Username of basic authentication.
  • `password`: Password of basic authentication.
  • `host`: Host name with port number.
  • `hostname`: Host name without port number.
  • `port`: Optional port number.
  • `pathname`: URL path.
  • `query`: Parsed object containing query string, unless parsing is set to false.
  • `hash`: The «fragment» portion of the URL including the pound-sign (`#`).
  • `href`: The full URL.
  • `origin`: The origin of the URL.

Note that when is used in a browser environment, it will default to using the browser’s current window location as the base URL when parsing all inputs. To parse an input independently of the browser’s current URL (e.g. for functionality parity with the library in a Node environment), pass an empty location object as the second parameter:

var parse = require('url-parse');

parse('hostname', {});

A simple helper function to change parts of the URL and propagating it through all properties. When you set a new you want the same value to be applied to if has a different port number, so it has a correct name again and so you have a complete URL.

var parsed = parse('http://google.com/parse-things');

parsed.set('hostname', 'yahoo.com');
console.log(parsed.href);

It’s aware of default ports so you cannot set a port 80 on an URL which has as protocol.

The returned object comes with a custom method which will generate a full URL again when called. The method accepts an extra function which will stringify the query string for you. If you don’t supply a function we will use our default method.

var location =url.toString();

You would rarely need to use this method as the full URL is also available as property. If you are using the method to make changes, this will automatically update.

XML Options

If your RSS feed contains fields that aren’t currently returned, you can access them using the `customFields` option.

let parser = new Parser({
  customFields: {
    feed: ['otherTitle', 'extendedDescription'],
    item: ['coAuthor', 'subtitle'],
  }
});

parser.parseURL('https://www.reddit.com/.rss', function(err, feed) {
  console.log(feed.extendedDescription);

  feed.items.forEach(function(entry) {
    console.log(entry.coAuthor + ':' + entry.subtitle);
  })
})

To rename fields, you can pass in an array with two items, in the format `[fromField, toField]`:

let parser = new Parser({
  customFields: {
    item: [
      ['dc:coAuthor', 'coAuthor'],
    ]
  }
})

To pass additional flags, provide an object as the third array item. Currently there is one such flag:

  • — set to to return all values for fields that can have multiple entries.
  • — set to to add an additional field, , with HTML stripped out
let parser = new Parser({
  customFields: {
    item: [
      ['media:content', 'media:content', {keepArray: true}],
    ]
  }
})

If your RSS Feed doesn’t contain a tag with a attribute, you can pass a option for the Parser to use:

let parser = new Parser({
  defaultRSS: 2.0
});
let parser = new Parser({
  xml2js: {
    emptyTag: '--EMPTY--',
  }
});

Показ примеров в

Во-первых, заметим, что любой js/html-код можно сделать запускаемым, добавив в начало .

При этом HTML будет при запуске показываться в снизу. Можно даже добавить для автозапуска при загрузке страницы.

ББ-теги, описанные ниже, дают альтернативные способы показа примера.

ББ-тег позволяет показать пример в действии в ‘е с минимальными «декорациями».

Например:

Покажет пример , без кода.

Параметры:

  • — высота (если автовычисленная не подходит)
  • — добавить в ифрейм ссылку для открытия в новом окне
  • — добавить в ифрейм ссылку для открытия в песочнице
  • — добавить в ифрейм ссылку на скачивание архива с примером

Обычно чистый используется для показа «как работает» пример без возможности залезть в код, например в качестве демки для задачи.

Если пример содержит несколько важных файлов — его можно показать через .

Это то же самое, что , но дополнительно над ‘ом будет лента с табами файлов примера. Любой файл можно выбрать и посмотреть.

Например:


С этим читают