火车头数据采集导入为可发布数据格式-需配合开发的kiss系统使用

作者: admin 分类: php 发布时间: 2017-05-19 08:59

最近因为一些版权问题,火车头 v7 程序经常闪退,采集需要迁移到 v9 版本上来,于是趁着空闲时间写了一个小程序。导出的格式和 v7 一样,可以直接发布。

<?php

// Export script: reads collected articles from the SQLite file SpiderResult.db3
// (table "Content") and writes them into one or more publishable SQLite
// databases ("1db", "2db", ...) with evenly spread publication timestamps.
date_default_timezone_set("Asia/Shanghai");
$file="SpiderResult.mdb"; // kept from the earlier Access/ODBC variant; not read by the SQLite path below
$split="[#]"; // separator between the paragraphs of one collected article
$num=3000; // number of source rows (by Id) exported into each output database

// Characters that are blanked out of the description excerpt.
// Repeated entries of the original list were dropped (a repeated needle is a no-op).
// NOTE(review): the original list contained two unparseable entries (`"""` and `"\"`);
// they are reconstructed here as the fullwidth quote / fullwidth backslash, since the
// ASCII quote is already listed - confirm against the author's original file.
$punctuation=array(
	'~','!','@','#','$','%','^','+','&','*',',','.','?',';',':','\'','"',
	'[',']','{','}','¥','……','…','、','。','“','”','【','】',
	'<','>','/','(',')','《','》','¿','×',
	'"','\',
	$split,
);

// Each pass exports the source rows with Id in ($m-$num, $m] into "<n>db".
// Raise the upper bound (1*$num) to split one source database into several DBs.
for($m=$num;$m<=1*$num;$m=$m+$num){
	$db2=$m/$num;
	$dbname=$db2."db";
	//$dbname="123db";
	$fromTime=strtotime("2011/3/20 00:00:00");
	$toTime=strtotime("2017/10/20 23:59:59");
	// Select which rows this pass uses, so one source database can produce several DBs.
	$idstart=$m-$num;
	$where="and Id> ".$idstart." and Id<=".$m;
	//$where="and Id> 20000 and Id<=41000";
	// On later passes output has already been sent, so only set the header while that is still possible.
	if(!headers_sent())header("Content-Type: text/html; charset=UTF-8");
	set_time_limit(0);
	ini_set('memory_limit', '1024M');
	$db=getDB();

	// realpath() returns false for a missing file; stop instead of silently opening an empty database.
	$sourcePath=realpath("SpiderResult.db3");
	if($sourcePath===false)die("找不到采集数据库 SpiderResult.db3");
	$dbc=new SQLite3($sourcePath,SQLITE3_OPEN_READWRITE | SQLITE3_OPEN_CREATE);

	$query=$dbc->query("select count(Id) as count from Content where 内容 is not null {$where}");
	$data=$query->fetchArray();
	$count= intval($data[0]);
	if($count==0||$toTime-$fromTime==0)die("没有找到文章.或者发布开始时间和结束时间相同.");
	// Seconds between two consecutive publication slots.
	$step=intval(($toTime-$fromTime)/$count);
	echo "一共发现<b>{$count}</b>篇文章.根据设定,从<b>".date("Y/m/d H:i:s",$fromTime)."</b>-<b>".date("Y/m/d H:i:s",$toTime)."</b>发布完,每个{$step}秒发布一篇.<br>开始转换成DB格式";
	flush();

	$query=$dbc->query("select ID as id,标题 as title,内容 as content from Content where 内容 is not null and 标题 is not null {$where} ORDER BY RANDOM()");
	$db->exec("begin exclusive transaction");
	$i=0;
	while ($row=$query->fetchArray(SQLITE3_ASSOC))
	{
		try{
			// TODO: process each fetched article here (translation etc.).
			// This is only a sample; the tag algorithm has to be chosen per language,
			// and without tags no related articles can be generated.
			$title=trim($row["title"]);
			$content=trim($row["content"]);

			// Description: first 126 bytes of the tag-free text with punctuation blanked out.
			// mb_strcut keeps the byte budget but never cuts a UTF-8 character in half.
			$plain=strip_tags($content);
			$excerpt=function_exists("mb_strcut")?mb_strcut($plain,0,126,"UTF-8"):substr($plain,0,126);
			$description=trim(str_ireplace($punctuation," ",$excerpt));

			//$content = title_do($title,$content,$split);
			// Articles with fewer than 10 paragraphs are skipped.
			$s=explode($split,$content);
			if(count($s)<10) continue;
			shuffle($s);
			$images=array();
			foreach($s as $j=>$paragraph){
				$s[$j]=str_replace(array("h2>","span>","... "),array("h3>","p>",""),$paragraph);
				// one random image id (1..706) per paragraph
				$images[]=rand(1,706);
			}
			$imgids=implode(",",$images);
			$content=implode($split,$s);

			if(empty($title)||empty($content))
			{
				continue;
			}

			// The space keeps the last title word and the first description word from fusing into one tag.
			$tag=getTags($title." ".$description);
			$postname=clearPoint($title);

			// Publication time: the i-th slot plus a random offset inside the slot.
			$posttime=$fromTime+$i*$step+mt_rand(0,$step);
			$i++;

			// Every text column is escaped, including the description.
			$titleSql=$db->escapeString($title);
			$tagSql=$db->escapeString($tag);
			$descriptionSql=$db->escapeString($description);
			$contentSql=$db->escapeString($content);
			$postnameSql=$db->escapeString($postname);
			$sql="insert into post values ('{$titleSql}','{$tagSql}','{$descriptionSql}','{$contentSql}','{$imgids}','{$titleSql}',{$posttime},'{$postnameSql}')";
			// Best effort: a failing insert (e.g. a duplicate postname on the unique index) is skipped silently.
			@$db->exec($sql);

			// Commit every 500 rows and report progress.
			if($i%500==0)
			{
				$db->exec("end transaction");

				$db->exec("begin exclusive transaction");
				echo date("Y/m/d H:i:s")."&nbsp;&nbsp;".$i."&nbsp;:成功转换:{$title}<br>\n";
				// ob_flush() raises a notice when no output buffer is active.
				if(ob_get_level()>0)ob_flush();
				flush();
			}

		}
		catch(Exception $e)
		{
			echo $e;
		}
	}
	echo "{$i}&nbsp;转换完成<br>";
	$db->exec("end transaction");
	echo "更新tags<br/>";
	updateTags();
	echo "完成更新tags<br/>";
	$db->close();
	// $data is the fetched count row (an array); the handle to release is the source connection.
	$dbc->close();
}

/**
 * Builds a comma-separated tag list for languages whose words are separated
 * by spaces (English, Russian, ...).
 *
 * The text is split on single spaces, each piece is trimmed, and only pieces
 * longer than 3 bytes are kept, in their original order.
 *
 * @param string $title text to extract tags from
 * @return string tags joined with ",", or "" when no piece qualifies
 */

function getTags($title)
{
	$kept = array();
	foreach (explode(" ", $title) as $piece) {
		$word = trim($piece);
		if (strlen($word) > 3) {
			$kept[] = $word;
		}
	}
	return implode(",", $kept);
}

/**
 * Thins out the segments of an article that repeat its title.
 *
 * $content is split on $fengefu. A segment "contains the title" when, after
 * lower-casing both and removing all spaces, the title occurs inside it.
 * If more such segments exist than a random limit of 2 or 3, only that many
 * randomly chosen ones are kept, merged with all other segments, shuffled
 * and joined again; otherwise $content is returned unchanged.
 *
 * @param string $title   article title
 * @param string $content article body, segments joined by $fengefu
 * @param string $fengefu segment separator
 * @return string the (possibly reduced and reshuffled) content
 */
function title_do($title,$content,$fengefu)
{
	$needle = str_replace(" ", "", strtolower($title));

	$withTitle = array();
	$withoutTitle = array();
	foreach (explode($fengefu, $content) as $segment) {
		$haystack = str_replace(" ", "", strtolower($segment));
		if (strpos($haystack, $needle) !== false) {
			$withTitle[] = $segment;
		} else {
			$withoutTitle[] = $segment;
		}
	}

	if (count($withTitle) == 0) {
		return $content;
	}

	// keep 2-3 of the segments that repeat the title
	$limit = rand(2,3);
	if ($limit >= count($withTitle)) {
		return $content;
	}

	$survivors = array();
	foreach (array_rand($withTitle, $limit) as $index) {
		$survivors[] = $withTitle[$index];
	}

	$mixed = array_merge($survivors, $withoutTitle);
	shuffle($mixed);

	return implode($fengefu, $mixed);
}

		
// Opens the output database named by the global $dbname (creating the file when
// it does not exist) and makes sure the post/tag tables and their indexes exist.
// Schema statements run in order and stop at the first one that fails.
// Returns the SQLite3 handle in either case.
function getDB()
{
	global $dbname;

	if (file_exists($dbname)) {
		echo "<h2>{$dbname}存在,做追加操作。</h2>";
	}

	$handle = new SQLite3($dbname, SQLITE3_OPEN_READWRITE | SQLITE3_OPEN_CREATE);

	$schema = array(
		"CREATE TABLE IF NOT EXISTS post (title varchar(256),tag varchar(256),description varchar(512),content TEXT, imgids varchar(512),comment TEXT,posttime INTEGER,postname varchar(512))",
		"CREATE TABLE if not exists tag (k VARCHAR(128) PRIMARY KEY,v TEXT,count integer)",
		"create index if not exists indexcount on tag (count desc) ",
		"create index if not exists indextime on post (posttime desc)",
		"create UNIQUE index if not exists indexpostname on post (postname)",
	);
	foreach ($schema as $statement) {
		if (!$handle->exec($statement)) {
			break;
		}
	}

	return $handle;
}
// Runs $sql on the global output database $db and returns every result row as an
// associative array. Terminates the script with "Error:<sql>" when the query fails.
function getData($sql){
	global $db;

	$cursor = $db->query($sql);
	if (!$cursor) {
		die("Error:".$sql);
	}

	$rows = array();
	for ($record = $cursor->fetchArray(SQLITE3_ASSOC); $record; $record = $cursor->fetchArray(SQLITE3_ASSOC)) {
		$rows[] = $record;
	}
	return $rows;
}

/**
 * Makes sure the post/tag tables and their indexes exist in the global output
 * database $db.
 *
 * The schema matches the one created by getDB(): the post table carries the
 * postname column (the insert in the export loop supplies 8 values) and
 * postname has a unique index. Statements run in order and stop at the first
 * failure.
 *
 * @return bool true when every statement succeeded
 */
function checkTables()
{
	global $db;
	return $db->exec("CREATE TABLE IF NOT EXISTS post (title varchar(256),tag varchar(256),description varchar(512),content TEXT,imgids varchar(512),comment TEXT,posttime INTEGER,postname varchar(512))")
		&& $db->exec("CREATE TABLE if not exists tag (k VARCHAR(128) PRIMARY KEY,v TEXT,count integer)")
		&& $db->exec("create index if not exists indexcount on tag (count desc) ")
		&& $db->exec("create index if not exists indextime on post (posttime desc)")
		&& $db->exec("create UNIQUE index if not exists indexpostname on post (postname)");
}

/**
 * Rebuilds the tag table of the global output database $db from the post table.
 *
 * For every tag found in post.tag (comma-separated) one row is written:
 * k = the tag, v = comma-separated rowids of the posts carrying it,
 * count = number of those posts. Empty tags (a post whose tag column is "")
 * are ignored, and a tag repeated inside one post is counted once.
 * When no tag is found at all the existing tag table is left untouched.
 * Prints "<n> tags updated".
 */
function updateTags()
{
	global $db;

	// tag => list of rowids of the posts that carry it
	$tags=array();
	foreach(getData("select rowid,tag from post") as $d)
	{
		$seen=array();
		foreach(explode(",",(string)$d["tag"]) as $pt)
		{
			// explode("", ...) yields one empty piece; it is not a tag
			if($pt===""||isset($seen[$pt]))continue;
			$seen[$pt]=true;
			$tags[$pt][]=$d["rowid"];
		}
	}
	if(!empty($tags))
	{
		$db->exec("drop table if exists tag");
		checkTables();
		$db->exec("begin exclusive transaction");
		foreach($tags as $k=>$rowids)
		{
			// numeric-looking tags become integer array keys, hence the cast
			$sql="insert into tag(k,v,count) values ('".$db->escapeString((string)$k)."','".implode(",",$rowids)."',".count($rowids).")";
			$db->exec($sql);
		}
		$db->exec("end transaction");
	}
	echo count($tags)." tags updated";
}

/**
 * Turns a title into a URL-style post name (slug).
 *
 * Steps: remove punctuation, turn double spaces / "-" / "_" / "\" / "/" into a
 * space, transliterate accented Latin and Cyrillic letters to ASCII,
 * lower-case, strip tags, and join the remaining words with "-".
 *
 * @param string $kw title text
 * @return string slug, "" when nothing is left
 */
function clearPoint($kw)
{
	// Punctuation that is removed outright. Repeated entries of the original list
	// were dropped (removing the same character twice is a no-op).
	// NOTE(review): the original list contained two unparseable entries (`"""` and
	// `"\"`); they are reconstructed as the fullwidth quote / fullwidth backslash,
	// since the ASCII quote is already listed and the ASCII backslash is handled by
	// the next step - confirm against the author's original file.
	$punctuation = array(
		'~','!','@','#','$','%','^','+','&','*',',','.','?',';',':','\'','"',
		'[',']','{','}','¥','……','…','、','。','“','”','【','】',
		'<','>','/','(',')','《','》','¿','×',
		'"','\',
	);
	$kw = str_replace($punctuation, '', $kw);

	$kw = str_replace(array("  ","-","_","\\","/"), ' ', $kw);

	// Accented Latin letters -> plain ASCII (upper-case forms map to lower case).
	$latin = array(
		"á"=>"a","í"=>"i","é"=>"e","ó"=>"o","ú"=>"u","ñ"=>"n",
		"Á"=>"a","Í"=>"i","É"=>"e","Ó"=>"o","Ú"=>"u","Ñ"=>"n",
		"ç"=>"c","ã"=>"a","à"=>"a","â"=>"a","ê"=>"e","ô"=>"o","õ"=>"o","ü"=>"u",
	);
	$kw = str_replace(array_keys($latin), array_values($latin), $kw);

	// Cyrillic -> Latin. The original table listed "щ" twice (the second entry,
	// mapped to "y", could never match) and mapped "ы" to "s"; the duplicate is
	// taken to be the misplaced "ы" => "y" entry.
	// NOTE(review): lower- and upper-case rows still disagree for some letters
	// (e.g. "ж" => "zh" vs "Ж" => "J", "ш" => "s" vs "Ш" => "SH"); left as-is
	// because changing them alters post names that were already exported.
	$cyrillic = array(
		"а"=>"a","б"=>"b","в"=>"v","г"=>"g","д"=>"d","е"=>"e","ё"=>"e","ж"=>"zh",
		"з"=>"z","и"=>"i","й"=>"j","к"=>"k","л"=>"l","м"=>"m","н"=>"n","о"=>"o",
		"п"=>"p","р"=>"r","с"=>"s","т"=>"t","у"=>"u","ф"=>"f","х"=>"x","ц"=>"c",
		"ч"=>"ch","ш"=>"s","я"=>"ya","ю"=>"yu","щ"=>"sch","ы"=>"y","э"=>"e",
		"ъ"=>"","ь"=>"",
		"А"=>"A","Б"=>"B","В"=>"V","Г"=>"G","Д"=>"D","Е"=>"E","Ё"=>"E","Э"=>"E",
		"Ж"=>"J","З"=>"Z","И"=>"I","Й"=>"I","К"=>"K","Л"=>"L","М"=>"M","Н"=>"N",
		"О"=>"O","П"=>"P","Р"=>"R","С"=>"S","Т"=>"T","У"=>"U","Ф"=>"F","Х"=>"H",
		"Ц"=>"C","Ч"=>"CH","Ш"=>"SH","Щ"=>"SH","Ы"=>"Y","Ю"=>"YU","Я"=>"YA",
	);
	$kw = str_replace(array_keys($cyrillic), array_values($cyrillic), $kw);

	$kw = strtolower(strip_tags(trim($kw)));

	// Drop only empty pieces: array_filter() without a callback would also drop
	// a word that is exactly "0".
	$words = array_filter(explode(' ', $kw), 'strlen');

	return implode('-', $words);
}

?>

 

如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注