亚洲欧美精品一区二区,久久伊人影院,狠狠久久亚洲欧美

當前位置：首頁 > 范文|應用文 > IT技術專欄 > 網絡編程

php提取網頁正文內容的例子

來源：易賢網閱讀：1166 次日期：2014-12-10 13:52:38

溫馨提示：易賢網小編為您整理了“php提取網頁正文內容的例子”,方便廣大網友查閱！

難點在于如何識別并保留網頁中的文章部分，同時刪除其它無用的信息，并且要做到通用化：不能像火車頭那樣根據目標站來制定采集規則，因為搜索引擎結果中有各種各樣的網頁。

抓回一個頁面的數據，如何匹配出正文部分，鄭曉在下班路上想了個思路是：

1. 提取出body標簽部分–>剔除所有鏈接–>剔除所有script、注釋–>剔除所有空白標簽(包括標簽內不含中文的)–>獲取結果。

2. 直接匹配出非鏈接的、符合在div、p、h標簽中的中文部分???

這樣還是會有不少其它多余信息，比如底部信息等。如何處理？不知道大家有沒有什么思路或建議？

這個類是從網上找到的一個php實現的提取網頁正文部分的算法，鄭曉在本地也測試了下，準確率非常高。

代碼如下

<?php

/**
 * Extracts the main article body from an HTML page.
 *
 * The page is parsed with DOMDocument, every <p> element contributes a score
 * to its parent element, and the highest-scoring parent is returned as the
 * content box after junk tags and attributes have been stripped from it.
 */
class Readability {

    // Name of the DOM attribute in which each candidate element's score is stored.
    const ATTR_CONTENT_SCORE = "contentScore";

    // Charset forced onto the DOM document and used for the returned content.
    const DOM_DEFAULT_CHARSET = "utf-8";

    // Message of the exception thrown when no content box can be determined.
    const MESSAGE_CAN_NOT_GET = "Readability was unable to parse this page for content.";

    // The parsed document (DOMDocument instance).
    protected $DOM = null;

    // The raw source string exactly as passed to the constructor.
    protected $source = "";

    // Parent elements of the paragraphs scored by getTopBox().
    private $parentNodes = array();

    // Tags removed from the extracted content box.
    private $junkTags = array("style", "form", "iframe", "script", "button", "input", "textarea",
                              "noscript", "select", "option", "object", "applet", "basefont",
                              "bgsound", "blink", "canvas", "command", "menu", "nav", "datalist",
                              "embed", "frame", "frameset", "keygen", "label", "marquee", "link");

    // Attributes removed from every element of the extracted content box.
    private $junkAttrs = array("style", "class", "onclick", "onmouseover", "align", "border", "margin");

    /**
     * Parses the given HTML source into a DOM document.
     *
     * @param string $source     HTML source of the page
     * @param string $input_char Encoding of $source; defaults to utf-8
     */
    public function __construct($source, $input_char = "utf-8") {
        $this->source = $source;

        // Turn non-ASCII characters into HTML entities so the DOM parser does
        // not depend on the input encoding.
        // NOTE(review): the 'HTML-ENTITIES' pseudo-encoding is deprecated as of
        // PHP 8.2; kept so that the parsed result stays unchanged.
        $source = mb_convert_encoding($source, 'HTML-ENTITIES', $input_char);

        // Pre-process the markup (strip redundant tags etc.).
        $source = $this->preparSource($source);

        $this->DOM = new DOMDocument('1.0', $input_char);

        try {
            // Real-world markup produces plenty of libxml warnings; collect
            // them internally instead of emitting them, then restore the
            // caller's setting.
            $previous = libxml_use_internal_errors(true);
            // The leading "<?xml encoding=...>" declaration is a hack that
            // makes libxml read the markup with the default charset.
            $loaded = $this->DOM->loadHTML(
                '<?xml encoding="' . Readability::DOM_DEFAULT_CHARSET . '">' . $source
            );
            if (!$previous) {
                // Only discard the error buffer when collection was enabled here.
                libxml_clear_errors();
            }
            libxml_use_internal_errors($previous);

            if (!$loaded) {
                throw new Exception("Parse HTML Error!");
            }

            // Remove the processing-instruction node created by the hack
            // above. Collect first so the child list is not modified while it
            // is being iterated.
            $hackNodes = array();
            foreach ($this->DOM->childNodes as $item) {
                if ($item->nodeType == XML_PI_NODE) {
                    $hackNodes[] = $item;
                }
            }
            foreach ($hackNodes as $item) {
                $this->DOM->removeChild($item);
            }

            // Set the proper encoding on the document.
            $this->DOM->encoding = Readability::DOM_DEFAULT_CHARSET;
        } catch (Exception $e) {
            // Deliberately ignored: the document simply stays empty, in which
            // case getContent() finds no content box and throws.
        }
    }

    /**
     * Pre-processes the HTML so that the DOM parser handles it correctly.
     *
     * @param string $string HTML source
     * @return string
     */
    private function preparSource($string) {
        // Drop the first charset declaration so it cannot conflict with the
        // encoding forced in the constructor.
        $string = preg_replace('/charset=([\w|\-]+);?/', "", $string, 1);

        // Replace all doubled-up <br> tags with paragraph breaks, and remove fonts.
        $string = preg_replace('/<br\/?>[ \r\n\s]*<br\/?>/i', "</p><p>", $string);
        $string = preg_replace('/<\/?font[^>]*>/i', "", $string);

        // Strip <script> elements together with their content before parsing.
        $string = preg_replace("#<script(.*?)>(.*?)</script>#is", "", $string);

        return trim($string);
    }

    /**
     * Removes every $TagName element below $RootNode.
     *
     * @param DOMDocument $RootNode
     * @param string      $TagName
     * @return DOMDocument
     */
    private function removeJunkTag($RootNode, $TagName) {
        $Tags = $RootNode->getElementsByTagName($TagName);

        // Always index 0: removing a tag removes it from the result list as well.
        while ($Tag = $Tags->item(0)) {
            $parentNode = $Tag->parentNode;
            $parentNode->removeChild($Tag);
        }

        return $RootNode;
    }

    /**
     * Removes the attribute $Attr from every element below $RootNode.
     *
     * @param DOMDocument $RootNode
     * @param string      $Attr
     * @return DOMDocument
     */
    private function removeJunkAttr($RootNode, $Attr) {
        $Tags = $RootNode->getElementsByTagName("*");

        $i = 0;
        while ($Tag = $Tags->item($i++)) {
            $Tag->removeAttribute($Attr);
        }

        return $RootNode;
    }

    /**
     * Scores the parent of every <p> element and returns the best-scoring one.
     *
     * @return DOMNode|null The content box, or null when no parent has a positive score
     */
    private function getTopBox() {
        // All paragraphs of the page.
        $allParagraphs = $this->DOM->getElementsByTagName("p");

        // Study all the paragraphs and find the chunk that has the best score.
        $i = 0;
        while ($paragraph = $allParagraphs->item($i++)) {
            $parentNode = $paragraph->parentNode;
            if (!($parentNode instanceof DOMElement)) {
                // Only elements can carry the score attribute.
                continue;
            }

            $contentScore = intval($parentNode->getAttribute(Readability::ATTR_CONTENT_SCORE));
            $className    = $parentNode->getAttribute("class");
            $id           = $parentNode->getAttribute("id");

            // Look for a special class name.
            // NOTE(review): the positive-match pattern was missing from the
            // scraped listing and is restored from the arc90 Readability
            // heuristics - confirm against upstream.
            if (preg_match("/(comment|meta|footer|footnote)/i", $className)) {
                $contentScore -= 50;
            } else if (preg_match(
                '/((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/i',
                $className)) {
                $contentScore += 25;
            }

            // Look for a special ID.
            // NOTE(review): positive-match pattern restored as above - confirm.
            if (preg_match("/(comment|meta|footer|footnote)/i", $id)) {
                $contentScore -= 50;
            } else if (preg_match(
                '/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i',
                $id)) {
                $contentScore += 25;
            }

            // Paragraphs longer than 10 bytes add their byte length to the score.
            if (strlen($paragraph->nodeValue) > 10) {
                $contentScore += strlen($paragraph->nodeValue);
            }

            // Store the score on the parent element.
            $parentNode->setAttribute(Readability::ATTR_CONTENT_SCORE, $contentScore);

            // Remember the parent so the candidates can be compared below.
            array_push($this->parentNodes, $parentNode);
        }

        $topBox = null;

        for ($i = 0, $len = sizeof($this->parentNodes); $i < $len; $i++) {
            $parentNode      = $this->parentNodes[$i];
            $contentScore    = intval($parentNode->getAttribute(Readability::ATTR_CONTENT_SCORE));
            $orgContentScore = intval($topBox ? $topBox->getAttribute(Readability::ATTR_CONTENT_SCORE) : 0);

            if ($contentScore && $contentScore > $orgContentScore) {
                $topBox = $parentNode;
            }
        }

        // At this point $topBox is the element judged to hold the main content.
        return $topBox;
    }

    /**
     * Returns the page title, reduced to its leading segment when the title
     * contains ' - ' separators.
     *
     * @return string|null Null when the document has no <title> element
     */
    public function getTitle() {
        $split_point = ' - ';
        $titleNodes = $this->DOM->getElementsByTagName("title");

        if ($titleNodes->length
            && $titleNode = $titleNodes->item(0)) {
            $title = trim($titleNode->nodeValue);
            // Splitting the reversed string and reversing each piece back
            // yields the segments in reverse order, so array_pop() returns the
            // leading segment of the title.
            $result = array_map('strrev', explode($split_point, strrev($title)));
            return sizeof($result) > 1 ? array_pop($result) : $title;
        }

        return null;
    }

    /**
     * Returns the src attribute of the first <img> below $node.
     *
     * @param DOMDocument|DOMElement $node
     * @return string|null Null when $node contains no image
     */
    public function getLeadImageUrl($node) {
        $images = $node->getElementsByTagName("img");

        if ($images->length && $leadImage = $images->item(0)) {
            return $leadImage->getAttribute("src");
        }

        return null;
    }

    /**
     * Extracts the main content of the page.
     *
     * @return array|false Array with the keys 'lead_image_url', 'word_count',
     *                     'title' and 'content'; false when no DOM is available
     * @throws RuntimeException When no suitable content box is found
     */
    public function getContent() {
        if (!$this->DOM) return false;

        // Page title.
        $ContentTitle = $this->getTitle();

        // Main content box.
        $ContentBox = $this->getTopBox();

        // Check if we found a suitable top-box.
        if ($ContentBox === null) {
            throw new RuntimeException(Readability::MESSAGE_CAN_NOT_GET);
        }

        // Copy the content box into a fresh DOMDocument.
        $Target = new DOMDocument;
        $Target->appendChild($Target->importNode($ContentBox, true));

        // Remove unwanted tags.
        foreach ($this->junkTags as $tag) {
            $Target = $this->removeJunkTag($Target, $tag);
        }

        // Remove unwanted attributes.
        foreach ($this->junkAttrs as $attr) {
            $Target = $this->removeJunkAttr($Target, $attr);
        }

        // Decode the HTML entities of the serialized markup back to the default charset.
        $content = mb_convert_encoding($Target->saveHTML(), Readability::DOM_DEFAULT_CHARSET, "HTML-ENTITIES");

        // Several values are returned together as an array.
        return array(
            'lead_image_url' => $this->getLeadImageUrl($Target),
            'word_count' => mb_strlen(strip_tags($content), Readability::DOM_DEFAULT_CHARSET),
            'title' => $ContentTitle ? $ContentTitle : null,
            'content' => $content
        );
    }

    function __destruct() { }

}

使用起來也非常簡單：實例化時傳入網頁的html源碼和相應的編碼，然后直接調用其getContent方法，即可返回提取到的正文部分。提取出的文章中可能還會含有少部分鏈接，可以自己后期再修改。

更多信息請查看IT技術專欄

上一篇：PHP官方Windows擴展列表

下一篇：為ckeditor編輯器修改添加一鍵排版功能

易賢網手機網站地址：php提取網頁正文內容的例子

由于各方面情況的不斷調整與變化，易賢網提供的所有考試信息和咨詢回復僅供參考，敬請考生以權威部門公布的正式信息和咨詢為準！

相關閱讀網絡編程

Shell中如何刪除文本比較長的行的實現方法10月30日

vue.js語法及常用指令10月30日

python 讀寫中文json的實例詳解10月30日

Objective-C Json 實例詳解10月30日

bootstrap table sum總數量統計實現方法10月30日

python生成二維碼的實例詳解10月30日

Python批量更改文件名的實現方法10月30日

解決出現Incorrect integer value的問題10月30日

jQuery實現切換隱藏與顯示同時切換圖標功能10月30日

docker python api 安裝配置的詳解10月30日

javascript按鈕禁用和啟用的效果實例代碼10月30日

vue.js todolist實現代碼10月30日

vue.js 父向子組件傳參的實例代碼10月30日

apache 開啟重定向 rewrite的實現方法10月30日

Vue.js劃分組件的方法10月30日

python logging日志模塊的詳解10月30日

vue中的scope使用詳解10月30日

docker cgroup 資源監控的詳解10月30日

使用Android Studio 開發自己的SDK教程10月23日

linux系統下MongoDB單節點安裝教程10月23日

易賢網移動網站

2026上岸·考公考編培訓報班

報班類型
姓名
手機號
驗證碼

中文字幕免费精品_亚洲视频自拍_亚洲综合国产激情另类一区_色综合咪咪久久