欢迎您访问程序员文章站,本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页  >  后端开发

字符串 - PHP 敏感词违法关键字检测 算法方案

程序员文章站 2022-05-27 16:08:06
...
已有6000条关键字,分3批次。
一批为替换 replace,一批为遇到需要审核 censor,最后一批为遇到就禁止发布banned。
设计数据表如下:
mysql> desc tbl_censor;
+-------------+----------------------+------+-----+---------+----------------+
| Field       | Type                 | Null | Key | Default | Extra          |
+-------------+----------------------+------+-----+---------+----------------+
| id          | smallint(6) unsigned | NO   | PRI | NULL    | auto_increment |
| censortype  | smallint(6)          | NO   |     | 1       |                |
| find        | varchar(120)         | NO   | UNI |         |                |
| replacement | varchar(255)         | NO   |     |         |                |
| extra       | varchar(255)         | NO   |     |         |                |
| uptime      | int(11)              | YES  |     | NULL    |                |
| enable      | int(1)               | NO   |     | 1       |                |
+-------------+----------------------+------+-----+---------+----------------+
7 rows in set (0.01 sec)

由于有6000多个关键字,是用 foreach 逐个 strstr 检测,还是合并成正则后用 preg_match 检测?
追求效率,每小时提交量为10万多文章。


刚刚写的一种:

<?php
namespace app\helpers;


use app\models\other\Censor;
use app\models\other\CensorLog;

/**
 * Sensitive-word checker backed by the Censor table.
 *
 * Enabled rows are split into three groups by their `replacement` column:
 * '{banned}' (reject the text), '{censor}' (hold the text for review) and
 * anything else (replace `find` with `replacement`). The banned and censor
 * groups are compiled into one case-insensitive regular expression each and
 * the whole result is stored in the application cache under $id.
 *
 * NOTE(review): earlier revisions cached the replace lists inverted and used
 * '//i' for an empty word list; call flush() once after deploying this
 * version so stale cache entries are rebuilt.
 *
 * NOTE(review): one alternation over several thousand words may exceed the
 * PCRE compiled-pattern size limit; such failures are logged by findMatches()
 * rather than silently treated as "no match".
 */
class CensorHelper
{
    /** @var string Cache key under which the compiled word data is stored. */
    public $id;

    /** @var array|mixed Compiled data with keys 'censor', 'banned' and 'replace'; empty when nothing is loaded. */
    public $data;

    /** @var string[] Distinct banned words matched so far. */
    public $match_banned;

    /** @var string[] Distinct needs-review words matched so far. */
    public $match_censor;

    /**
     * @param string $id Cache key used to store and load the compiled word data.
     */
    public function __construct($id = 'censor')
    {
        $this->id = $id;
        $this->match_banned = [];
        $this->match_censor = [];
        $this->data = $this->getData();
    }

    /**
     * Load the compiled word data from cache, rebuilding it from the database on a miss.
     *
     * @return array|mixed Array with keys 'censor' and 'banned' (regex strings, '' when
     *                     the group is empty) and 'replace' (['from' => [...], 'to' => [...]]);
     *                     the raw cache-miss value when no enabled words exist.
     */
    public function getData()
    {
        $data = \Yii::$app->cache->get($this->id);
        if (!empty($data)) {
            return $data;
        }

        $words = Censor::find()
            ->where(['enable' => 1])
            ->andWhere([' != ', 'replacement', ''])
            ->orderBy(['replacement' => SORT_ASC, 'find' => SORT_DESC])
            ->asArray()
            ->all();

        $censor = [];
        $banned = [];
        $replace = [];
        foreach ($words as $row) {
            switch ($row['replacement']) {
                case '{censor}':
                    $censor[] = $row['find'];
                    break;

                case '{banned}':
                    $banned[] = $row['find'];
                    break;

                default:
                    // `find` is the text to look for, `replacement` is what it becomes.
                    $replace['from'][] = $row['find'];
                    $replace['to'][] = $row['replacement'];
                    break;
            }
        }

        // Cache as soon as any group has entries, so a replace-only list still works.
        if ($censor || $banned || $replace) {
            $data = [
                'censor' => $this->generateRegularExpression($censor),
                'banned' => $this->generateRegularExpression($banned),
                'replace' => $replace,
            ];

            \Yii::$app->cache->set($this->id, $data);
        }

        return $data;
    }

    /**
     * Build one case-insensitive alternation pattern from a word list.
     *
     * @param array $words Literal words; empty strings are ignored.
     * @return string The pattern, or '' when there is nothing to match. An empty
     *                alternation would match every input, so it is never produced.
     */
    public function generateRegularExpression(array $words)
    {
        $words = array_filter($words, function ($word) {
            return (string) $word !== '';
        });
        if (empty($words)) {
            return '';
        }

        // Pass the delimiter so that words containing "/" cannot terminate the pattern.
        $quoted = array_map(function ($word) {
            return preg_quote($word, '/');
        }, $words);

        return '/' . implode('|', $quoted) . '/i';
    }

    /**
     * Run both the banned and the needs-review checks against a text.
     *
     * @param string $string Text to inspect.
     */
    public function check($string)
    {
        $this->banned($string);
        $this->censor($string);
    }

    /**
     * Record every needs-review word found in the text in $match_censor.
     *
     * @param string $string Text to inspect.
     */
    public function censor($string)
    {
        $pattern = isset($this->data['censor']) ? $this->data['censor'] : '';
        $this->match_censor = $this->mergeMatches($this->match_censor, $this->findMatches($pattern, $string));
    }

    /**
     * Record every banned word found in the text in $match_banned.
     *
     * @param string $string Text to inspect.
     */
    public function banned($string)
    {
        $pattern = isset($this->data['banned']) ? $this->data['banned'] : '';
        $this->match_banned = $this->mergeMatches($this->match_banned, $this->findMatches($pattern, $string));
    }

    /**
     * Drop the cached word data and reload it into this instance.
     */
    public function flush()
    {
        \Yii::$app->cache->delete($this->id);
        // Keep the instance in sync; otherwise it would go on using the stale copy.
        $this->data = $this->getData();
    }

    /**
     * Apply the plain (case-sensitive) replacements to a text.
     *
     * @param string $string Text to rewrite.
     * @return mixed The rewritten text, or the input unchanged when no replacements are loaded.
     */
    public function replace($string)
    {
        if (empty($this->data['replace']['from']) || !isset($this->data['replace']['to'])) {
            return $string;
        }

        return str_replace($this->data['replace']['from'], $this->data['replace']['to'], $string);
    }

    /**
     * Summarise the checks run so far.
     *
     * @return string 'banned', 'censor' or 'pass', in that order of precedence.
     */
    public function getLevel()
    {
        if (!empty($this->match_banned)) {
            return 'banned';
        }

        if (!empty($this->match_censor)) {
            return 'censor';
        }

        return 'pass';
    }

    /**
     * Persist a log record describing the words matched so far.
     *
     * @param mixed $tableId Stored in the log's `datatb` field.
     * @param mixed $dataId  Stored in the log's `dataid` field.
     * @return bool Result of CensorLog::save(), so callers can detect a failed write.
     */
    public function addLog($tableId, $dataId)
    {
        $log = new CensorLog();
        $log->datatb = $tableId;
        $log->dataid = $dataId;
        $log->matchcensor = implode(',', $this->match_censor);
        $log->matchbanned = implode(',', $this->match_banned);
        $log->addtime = time();

        if (!\Yii::$app->user->isGuest) {
            $log->uid = \Yii::$app->user->getId();
            $log->uname = \Yii::$app->user->getUname();
        }

        $log->ip = IpHelper::getIP();
        $log->iploc = IpHelper::getLocation($log->ip);

        return $log->save();
    }

    /**
     * Return every substring of $string matched by $pattern.
     *
     * @param string $pattern Compiled pattern; '' means "no words configured".
     * @param string $string  Text to inspect.
     * @return string[] Matched substrings in order of appearance (may repeat).
     */
    private function findMatches($pattern, $string)
    {
        if (empty($pattern)) {
            return [];
        }

        // preg_match_all yields a list of hits; preg_match would only give a single string.
        $count = preg_match_all($pattern, $string, $matches);
        if ($count === false) {
            // A broken or oversized pattern must not pass unnoticed in a content filter.
            \Yii::error('Censor pattern could not be applied, preg error code ' . preg_last_error(), __METHOD__);

            return [];
        }

        return $count > 0 ? $matches[0] : [];
    }

    /**
     * Append new hits to an existing list, keeping each distinct string once.
     *
     * @param array $known Previously recorded matches.
     * @param array $found Newly found matches.
     * @return string[]
     */
    private function mergeMatches(array $known, array $found)
    {
        return array_values(array_unique(array_merge($known, $found)));
    }
}

回复内容:

已有6000条关键字,分3批次。
一批为替换 replace,一批为遇到需要审核 censor,最后一批为遇到就禁止发布banned。
设计数据表如下:

mysql> desc tbl_censor;
+-------------+----------------------+------+-----+---------+----------------+
| Field       | Type                 | Null | Key | Default | Extra          |
+-------------+----------------------+------+-----+---------+----------------+
| id          | smallint(6) unsigned | NO   | PRI | NULL    | auto_increment |
| censortype  | smallint(6)          | NO   |     | 1       |                |
| find        | varchar(120)         | NO   | UNI |         |                |
| replacement | varchar(255)         | NO   |     |         |                |
| extra       | varchar(255)         | NO   |     |         |                |
| uptime      | int(11)              | YES  |     | NULL    |                |
| enable      | int(1)               | NO   |     | 1       |                |
+-------------+----------------------+------+-----+---------+----------------+
7 rows in set (0.01 sec)

由于有6000多个关键字,是用 foreach 逐个 strstr 检测,还是合并成正则后用 preg_match 检测?
追求效率,每小时提交量为10万多文章。


刚刚写的一种:

<?php
namespace app\helpers;


use app\models\other\Censor;
use app\models\other\CensorLog;

/**
 * Sensitive-word checker backed by the Censor table.
 *
 * Enabled rows are split into three groups by their `replacement` column:
 * '{banned}' (reject the text), '{censor}' (hold the text for review) and
 * anything else (replace `find` with `replacement`). The banned and censor
 * groups are compiled into one case-insensitive regular expression each and
 * the whole result is stored in the application cache under $id.
 *
 * NOTE(review): earlier revisions cached the replace lists inverted and used
 * '//i' for an empty word list; call flush() once after deploying this
 * version so stale cache entries are rebuilt.
 *
 * NOTE(review): one alternation over several thousand words may exceed the
 * PCRE compiled-pattern size limit; such failures are logged by findMatches()
 * rather than silently treated as "no match".
 */
class CensorHelper
{
    /** @var string Cache key under which the compiled word data is stored. */
    public $id;

    /** @var array|mixed Compiled data with keys 'censor', 'banned' and 'replace'; empty when nothing is loaded. */
    public $data;

    /** @var string[] Distinct banned words matched so far. */
    public $match_banned;

    /** @var string[] Distinct needs-review words matched so far. */
    public $match_censor;

    /**
     * @param string $id Cache key used to store and load the compiled word data.
     */
    public function __construct($id = 'censor')
    {
        $this->id = $id;
        $this->match_banned = [];
        $this->match_censor = [];
        $this->data = $this->getData();
    }

    /**
     * Load the compiled word data from cache, rebuilding it from the database on a miss.
     *
     * @return array|mixed Array with keys 'censor' and 'banned' (regex strings, '' when
     *                     the group is empty) and 'replace' (['from' => [...], 'to' => [...]]);
     *                     the raw cache-miss value when no enabled words exist.
     */
    public function getData()
    {
        $data = \Yii::$app->cache->get($this->id);
        if (!empty($data)) {
            return $data;
        }

        $words = Censor::find()
            ->where(['enable' => 1])
            ->andWhere([' != ', 'replacement', ''])
            ->orderBy(['replacement' => SORT_ASC, 'find' => SORT_DESC])
            ->asArray()
            ->all();

        $censor = [];
        $banned = [];
        $replace = [];
        foreach ($words as $row) {
            switch ($row['replacement']) {
                case '{censor}':
                    $censor[] = $row['find'];
                    break;

                case '{banned}':
                    $banned[] = $row['find'];
                    break;

                default:
                    // `find` is the text to look for, `replacement` is what it becomes.
                    $replace['from'][] = $row['find'];
                    $replace['to'][] = $row['replacement'];
                    break;
            }
        }

        // Cache as soon as any group has entries, so a replace-only list still works.
        if ($censor || $banned || $replace) {
            $data = [
                'censor' => $this->generateRegularExpression($censor),
                'banned' => $this->generateRegularExpression($banned),
                'replace' => $replace,
            ];

            \Yii::$app->cache->set($this->id, $data);
        }

        return $data;
    }

    /**
     * Build one case-insensitive alternation pattern from a word list.
     *
     * @param array $words Literal words; empty strings are ignored.
     * @return string The pattern, or '' when there is nothing to match. An empty
     *                alternation would match every input, so it is never produced.
     */
    public function generateRegularExpression(array $words)
    {
        $words = array_filter($words, function ($word) {
            return (string) $word !== '';
        });
        if (empty($words)) {
            return '';
        }

        // Pass the delimiter so that words containing "/" cannot terminate the pattern.
        $quoted = array_map(function ($word) {
            return preg_quote($word, '/');
        }, $words);

        return '/' . implode('|', $quoted) . '/i';
    }

    /**
     * Run both the banned and the needs-review checks against a text.
     *
     * @param string $string Text to inspect.
     */
    public function check($string)
    {
        $this->banned($string);
        $this->censor($string);
    }

    /**
     * Record every needs-review word found in the text in $match_censor.
     *
     * @param string $string Text to inspect.
     */
    public function censor($string)
    {
        $pattern = isset($this->data['censor']) ? $this->data['censor'] : '';
        $this->match_censor = $this->mergeMatches($this->match_censor, $this->findMatches($pattern, $string));
    }

    /**
     * Record every banned word found in the text in $match_banned.
     *
     * @param string $string Text to inspect.
     */
    public function banned($string)
    {
        $pattern = isset($this->data['banned']) ? $this->data['banned'] : '';
        $this->match_banned = $this->mergeMatches($this->match_banned, $this->findMatches($pattern, $string));
    }

    /**
     * Drop the cached word data and reload it into this instance.
     */
    public function flush()
    {
        \Yii::$app->cache->delete($this->id);
        // Keep the instance in sync; otherwise it would go on using the stale copy.
        $this->data = $this->getData();
    }

    /**
     * Apply the plain (case-sensitive) replacements to a text.
     *
     * @param string $string Text to rewrite.
     * @return mixed The rewritten text, or the input unchanged when no replacements are loaded.
     */
    public function replace($string)
    {
        if (empty($this->data['replace']['from']) || !isset($this->data['replace']['to'])) {
            return $string;
        }

        return str_replace($this->data['replace']['from'], $this->data['replace']['to'], $string);
    }

    /**
     * Summarise the checks run so far.
     *
     * @return string 'banned', 'censor' or 'pass', in that order of precedence.
     */
    public function getLevel()
    {
        if (!empty($this->match_banned)) {
            return 'banned';
        }

        if (!empty($this->match_censor)) {
            return 'censor';
        }

        return 'pass';
    }

    /**
     * Persist a log record describing the words matched so far.
     *
     * @param mixed $tableId Stored in the log's `datatb` field.
     * @param mixed $dataId  Stored in the log's `dataid` field.
     * @return bool Result of CensorLog::save(), so callers can detect a failed write.
     */
    public function addLog($tableId, $dataId)
    {
        $log = new CensorLog();
        $log->datatb = $tableId;
        $log->dataid = $dataId;
        $log->matchcensor = implode(',', $this->match_censor);
        $log->matchbanned = implode(',', $this->match_banned);
        $log->addtime = time();

        if (!\Yii::$app->user->isGuest) {
            $log->uid = \Yii::$app->user->getId();
            $log->uname = \Yii::$app->user->getUname();
        }

        $log->ip = IpHelper::getIP();
        $log->iploc = IpHelper::getLocation($log->ip);

        return $log->save();
    }

    /**
     * Return every substring of $string matched by $pattern.
     *
     * @param string $pattern Compiled pattern; '' means "no words configured".
     * @param string $string  Text to inspect.
     * @return string[] Matched substrings in order of appearance (may repeat).
     */
    private function findMatches($pattern, $string)
    {
        if (empty($pattern)) {
            return [];
        }

        // preg_match_all yields a list of hits; preg_match would only give a single string.
        $count = preg_match_all($pattern, $string, $matches);
        if ($count === false) {
            // A broken or oversized pattern must not pass unnoticed in a content filter.
            \Yii::error('Censor pattern could not be applied, preg error code ' . preg_last_error(), __METHOD__);

            return [];
        }

        return $count > 0 ? $matches[0] : [];
    }

    /**
     * Append new hits to an existing list, keeping each distinct string once.
     *
     * @param array $known Previously recorded matches.
     * @param array $found Newly found matches.
     * @return string[]
     */
    private function mergeMatches(array $known, array $found)
    {
        return array_values(array_unique(array_merge($known, $found)));
    }
}

trie 树算法最适合。

PHP 关键词过滤扩展,该扩展依赖于 libdatrie(双数组 Trie 的 C 语言实现)。

你这个敏感词匹配,不需要用到正则,只用简单的匹配或者替换就行了。

关键字分成三类存memcached。

然后对文章进行匹配,应该从最严厉的banned来匹配,接着是要censor的关键字,最后才是可以replace的敏感词。

1 遇到就禁止发布 => strpos
2 遇到需要审核 => strpos
3 替换 => str_replace