* * @copyright Copyright (C) 2008-2019, Yoast BV * The following code is a derivative work of the code from the Yoast(https://github.com/Yoast/wordpress-seo/), which is licensed under GPL v3. */

namespace RankMath\Sitemap;

use WP_Query;
use DOMDocument;
use RankMath\Helper;
use RankMath\Helpers\Attachment;
use RankMath\Helpers\Str;
use RankMath\Helpers\Url;
use RankMath\Helpers\Arr;
use RankMath\Traits\Hooker;

defined( 'ABSPATH' ) || exit;

/**
 * Image_Parser class.
 *
 * Collects the image URLs that belong to a post or a term (featured image,
 * `<img />` tags in the content, gallery shortcodes, attachment posts and
 * configured custom fields) so they can be listed in the XML sitemap.
 */
class Image_Parser {

	use Hooker;

	/**
	 * Holds the `home_url()` value to speed up loops.
	 *
	 * @var string
	 */
	protected $home_url = '';

	/**
	 * Holds site URL hostname (without a leading "www.").
	 *
	 * @var string
	 */
	protected $host = '';

	/**
	 * Holds site URL protocol.
	 *
	 * @var string
	 */
	protected $scheme = 'http';

	/**
	 * Cached set of attachments for multiple posts.
	 *
	 * @var array
	 */
	protected $attachments = [];

	/**
	 * Holds blog charset value for use in DOM parsing.
	 *
	 * @var string
	 */
	protected $charset = 'UTF-8';

	/**
	 * Hold post.
	 *
	 * @var object
	 */
	private $post = null;

	/**
	 * Hold parsed images data.
	 *
	 * @var array
	 */
	private $images = [];

	/**
	 * Set up URL properties for reuse.
	 */
	public function __construct() {
		$this->home_url = home_url();
		$parsed_home    = wp_parse_url( $this->home_url );

		if ( ! empty( $parsed_home['host'] ) ) {
			// Strip only a leading "www." label. Removing every occurrence would
			// corrupt hosts such as "sub.www.example.com" and break the host check.
			$this->host = preg_replace( '/^www\./i', '', $parsed_home['host'] );
		}

		if ( ! empty( $parsed_home['scheme'] ) ) {
			$this->scheme = $parsed_home['scheme'];
		}

		$this->charset = esc_attr( get_bloginfo( 'charset' ) );
	}

	/**
	 * Get set of image data sets for the given post.
	 *
	 * @param object $post Post object to get images for.
	 *
	 * @return array|bool Image items, or false when images are disabled in the sitemap settings.
	 */
	public function get_images( $post ) {
		if ( ! Helper::get_settings( 'sitemap.include_images' ) ) {
			return false;
		}

		// Validate before touching internal state so a bad argument never lingers in $this->post.
		if ( ! is_object( $post ) ) {
			return [];
		}

		$this->post = $post;

		$this->get_post_thumbnail();
		$this->get_post_images();
		$this->get_post_galleries();
		$this->get_is_attachment();
		$this->get_custom_field_images();

		// Reset internal state so the next call starts clean.
		$images       = $this->images;
		$this->images = [];
		$this->post   = null;

		/**
		 * Filter images to be included for the post in XML sitemap.
		 *
		 * @param array $images  Array of image items.
		 * @param int   $post_id ID of the post.
		 */
		return $this->do_filter( 'sitemap/urlimages', $images, $post->ID );
	}

	/**
	 * Get term images.
	 *
	 * Looks at `<img />` tags and gallery shortcodes in the term description.
	 *
	 * @param object $term Term to get images from description for.
	 *
	 * @return array|bool Image items, or false when images are disabled in the sitemap settings.
	 */
	public function get_term_images( $term ) {
		if ( ! Helper::get_settings( 'sitemap.include_images' ) ) {
			return false;
		}

		$images = $this->parse_html_images( $term->description );

		foreach ( $this->parse_galleries( $term->description ) as $attachment ) {
			$images[] = [
				'src' => $this->get_absolute_url( $this->image_url( $attachment->ID ) ),
			];
		}

		return $images;
	}

	/**
	 * Get post thumbnail.
	 */
	private function get_post_thumbnail() {
		// Check the setting first so the thumbnail lookup is skipped when featured images are disabled.
		if ( ! Helper::get_settings( 'sitemap.include_featured_image' ) ) {
			return;
		}

		$thumbnail_id = get_post_thumbnail_id( $this->post->ID );
		if ( ! Attachment::attachment_in_sitemap( $thumbnail_id ) ) {
			return;
		}

		$this->get_image_item( $this->get_absolute_url( $this->image_url( $thumbnail_id ) ) );
	}

	/**
	 * Get images from post content.
	 */
	private function get_post_images() {
		/**
		 * Filter: 'rank_math/sitemap/content_before_parse_html_images' - Filters the post content
		 * before it is parsed for images.
		 *
		 * @param string $content The raw/unprocessed post content.
		 */
		$content = $this->do_filter( 'sitemap/content_before_parse_html_images', $this->post->post_content, $this->post->ID );
		$content = do_blocks( $content );

		foreach ( $this->parse_html_images( $content ) as $image ) {
			$this->get_image_item( $image['src'] );
		}
	}

	/**
	 * Get post galleries.
	 */
	private function get_post_galleries() {
		foreach ( $this->parse_galleries( $this->post->post_content, $this->post->ID ) as $attachment ) {
			$this->get_image_item( $this->get_absolute_url( $this->image_url( $attachment->ID ) ) );
		}
	}

	/**
	 * Get image if post is attachment.
	 */
	private function get_is_attachment() {
		if ( 'attachment' === $this->post->post_type && wp_attachment_is_image( $this->post ) ) {
			$this->get_image_item( $this->get_absolute_url( $this->image_url( $this->post->ID ) ) );
		}
	}

	/**
	 * Get images from custom fields.
	 *
	 * The list of meta keys comes from the per-post-type sitemap setting,
	 * one key per line.
	 */
	private function get_custom_field_images() {
		$customs = Helper::get_settings( 'sitemap.pt_' . $this->post->post_type . '_image_customfields' );
		if ( empty( $customs ) ) {
			return;
		}

		$customs = Arr::from_string( $customs, "\n" );
		foreach ( $customs as $key ) {
			$src = get_post_meta( $this->post->ID, $key, true );
			if ( Str::is_non_empty( $src ) && Helper::is_image_url( $src ) ) {
				$this->get_image_item( $src );
			}
		}
	}

	/**
	 * Parse `<img />` tags in content.
	 *
	 * @param string $content Content string to parse.
	 *
	 * @return array List of arrays, each holding a `src` key.
	 */
	private function parse_html_images( $content ) {
		$images   = [];
		$document = $this->get_document( $content );

		if ( false === $document ) {
			return $images;
		}

		foreach ( $document->getElementsByTagName( 'img' ) as $img ) {
			$src = $this->get_image_src( $img );
			if ( false === $src ) {
				continue;
			}

			$images[] = [ 'src' => $src ];
		}

		return $images;
	}

	/**
	 * Get DOM document.
	 *
	 * @param string $content Content to parse.
	 *
	 * @return bool|DOMDocument False when DOMDocument is unavailable or the content is empty.
	 */
	private function get_document( $content ) {
		if ( ! class_exists( 'DOMDocument' ) || empty( $content ) ) {
			return false;
		}

		// Prevent DOMDocument from bubbling warnings about invalid HTML,
		// remembering the previous setting so it can be restored afterwards.
		$previous_state = libxml_use_internal_errors( true );

		$post_dom = new DOMDocument();

		// The XML encoding declaration makes the parser read the markup using the blog charset.
		$post_dom->loadHTML( '<?xml encoding="' . $this->charset . '">' . $content );

		// Clear the errors, so they don't get kept in memory.
		libxml_clear_errors();

		// Do not leak our error-handling mode to other code using libxml.
		libxml_use_internal_errors( $previous_state );

		return $post_dom;
	}

	/**
	 * Get image source from node.
	 *
	 * @param \DOMElement $node Image element.
	 *
	 * @return bool|string Absolute image URL, or false when the image must be skipped.
	 */
	private function get_image_src( $node ) {
		$src = $node->getAttribute( 'src' );
		if ( $node->hasAttribute( 'data-sitemapexclude' ) || empty( $src ) ) {
			return false;
		}

		$class = $node->getAttribute( 'class' );
		if ( // This detects WP-inserted images, which we need to upsize. R.
			! empty( $class ) &&
			! Str::contains( 'size-full', $class ) &&
			preg_match( '|wp-image-(?P<id>\d+)|', $class, $matches ) &&
			get_post_status( $matches['id'] )
		) {
			$src = $this->image_url( $matches['id'] );
		}

		$src = $this->get_absolute_url( $src );

		// A URL that esc_url() would alter is never included.
		$is_clean = esc_url( $src ) === $src;

		// The filter is always consulted, matching the previous behaviour for hooked callbacks.
		$allow_external = $this->do_filter( 'sitemap/include_external_image', false );

		if ( ! $is_clean ) {
			return false;
		}

		// Unless external images are allowed, the URL has to contain the site host.
		if ( ! $allow_external && ! Str::contains( $this->host, $src ) ) {
			return false;
		}

		return $src;
	}

	/**
	 * Parse gallery shortcodes in a given content.
	 *
	 * @param string $content Content string.
	 * @param int    $post_id Optional ID of post being parsed.
	 *
	 * @return array Set of attachment objects.
	 */
	private function parse_galleries( $content, $post_id = 0 ) {
		$attachments = [];
		$galleries   = $this->get_content_galleries( $content );

		foreach ( $galleries as $gallery ) {
			// An explicit `id` attribute overrides the post being parsed.
			$id = $post_id;
			if ( ! empty( $gallery['id'] ) ) {
				$id = intval( $gallery['id'] );
			}

			// Forked from core gallery_shortcode() to have exact same logic. R.
			if ( ! empty( $gallery['ids'] ) ) {
				$gallery['include'] = $gallery['ids'];
			}

			$attachments = array_merge( $attachments, $this->get_gallery_attachments( $id, $gallery ) );
		}

		return array_unique( $attachments, SORT_REGULAR );
	}

	/**
	 * Retrieves galleries from the passed content.
	 * Forked from core to skip executing shortcodes for performance.
	 *
	 * @param string $content Content to parse for shortcodes.
	 *
	 * @return array A list of arrays, each containing gallery data.
	 */
	private function get_content_galleries( $content ) {
		if ( ! preg_match_all( '/' . get_shortcode_regex( [ 'gallery' ] ) . '/s', $content, $matches, PREG_SET_ORDER ) ) {
			return [];
		}

		$galleries = [];
		foreach ( $matches as $shortcode ) {
			// Index 2 holds the shortcode tag, index 3 its raw attribute string.
			if ( 'gallery' !== $shortcode[2] ) {
				continue;
			}

			$attributes  = shortcode_parse_atts( $shortcode[3] );
			$galleries[] = '' === $attributes ? [] : $attributes;
		}

		return $galleries;
	}

	/**
	 * Set image item array with filters applied.
	 *
	 * Appends the item to the internal image list unless the filtered URL is empty.
	 *
	 * @param string $src Image URL.
	 */
	private function get_image_item( $src ) {
		$image = [];

		/**
		 * Filter image URL to be included in XML sitemap for the post.
		 *
		 * @param string $src  Image URL.
		 * @param object $post Post object.
		 */
		$image['src'] = $this->do_filter( 'sitemap/xml_img_src', $src, $this->post );

		if ( Str::is_empty( $image['src'] ) ) {
			return;
		}

		/**
		 * Filter image data to be included in XML sitemap for the post.
		 *
		 * @param array $image Array of image data. {
		 *     @type string $src Image URL.
		 * }
		 * @param object $post Post object.
		 */
		$this->images[] = $this->do_filter( 'sitemap/xml_img', $image, $this->post );
	}

	/**
	 * Get attached image URL with filters applied. Adapted from core for speed.
	 *
	 * @param int $post_id ID of the post.
	 *
	 * @return string Image URL, or an empty string when it cannot be resolved.
	 */
	private function image_url( $post_id ) {
		$src = $this->normalize_image_url( $post_id );
		if ( false === $src ) {
			return '';
		}

		return apply_filters( 'wp_get_attachment_url', $src, $post_id ); // phpcs:ignore
	}

	/**
	 * Get attached image URL.
	 *
	 * @param int $post_id ID of the post.
	 *
	 * @return bool|string URL built from the upload directory, or false on failure.
	 */
	private function normalize_image_url( $post_id ) {
		$uploads    = $this->get_upload_dir();
		$attachment = get_post_meta( $post_id, '_wp_attached_file', true );

		if ( false !== $uploads['error'] || empty( $attachment ) ) {
			return false;
		}

		// Check that the upload base exists in the file location.
		if ( 0 === strpos( $attachment, $uploads['basedir'] ) ) {
			return str_replace( $uploads['basedir'], $uploads['baseurl'], $attachment );
		}

		// Keep only the part after "wp-content/uploads" (18 characters long).
		if ( false !== strpos( $attachment, 'wp-content/uploads' ) ) {
			return $uploads['baseurl'] . substr( $attachment, ( strpos( $attachment, 'wp-content/uploads' ) + 18 ) );
		}

		// It's a newly uploaded file, therefore $attachment is relative to the baseurl.
		return $uploads['baseurl'] . '/' . $attachment;
	}

	/**
	 * Get WordPress upload directory.
	 *
	 * The result of `wp_upload_dir()` is kept in a static variable for reuse.
	 *
	 * @return array
	 */
	private function get_upload_dir() {
		static $rank_math_wp_uploads;

		if ( empty( $rank_math_wp_uploads ) ) {
			$rank_math_wp_uploads = wp_upload_dir();
		}

		return $rank_math_wp_uploads;
	}

	/**
	 * Make absolute URL for domain or protocol-relative one.
	 *
	 * @param string $src URL to process.
	 *
	 * @return string
	 */
	private function get_absolute_url( $src ) {
		if ( Str::is_empty( $src ) ) {
			return $src;
		}

		if ( true === Url::is_relative( $src ) ) {
			// Only root-relative URLs are prefixed with the home URL; others are returned untouched.
			return '/' !== $src[0] ? $src : $this->home_url . $src;
		}

		// If not starting with protocol, we add the scheme as the standard requires a protocol.
		return ! Str::starts_with( 'http', $src ) ? $this->scheme . ':' . $src : $src;
	}

	/**
	 * Returns the attachments for a gallery.
	 *
	 * @param int   $id      The post ID.
	 * @param array $gallery The gallery config.
	 *
	 * @return array The selected attachments.
	 */
	private function get_gallery_attachments( $id, $gallery ) {
		// When there are attachments to include.
		if ( ! empty( $gallery['include'] ) ) {
			return $this->get_gallery_attachments_for_included( $gallery['include'] );
		}

		// Without an explicit include list a parent post is required.
		return empty( $id ) ? [] : $this->get_gallery_attachments_for_parent( $id, $gallery );
	}

	/**
	 * Returns the attachments for the given ID.
	 *
	 * @param int   $id      The post ID.
	 * @param array $gallery The gallery config.
	 *
	 * @return array The selected attachments.
	 */
	private function get_gallery_attachments_for_parent( $id, $gallery ) {
		$query = [
			'posts_per_page' => -1,
			'post_parent'    => $id,
		];

		// When there are posts that should be excluded from result set.
		if ( ! empty( $gallery['exclude'] ) ) {
			$query['post__not_in'] = wp_parse_id_list( $gallery['exclude'] );
		}

		return $this->get_attachments( $query );
	}

	/**
	 * Returns an array with attachments for the post IDs that will be included.
	 *
	 * @param array $include Array with ids to include.
	 *
	 * @return array The found attachments, keyed by attachment ID.
	 */
	private function get_gallery_attachments_for_included( $include ) {
		$ids_to_include = wp_parse_id_list( $include );
		$attachments    = $this->get_attachments(
			[
				'posts_per_page' => count( $ids_to_include ),
				'post__in'       => $ids_to_include,
			]
		);

		$gallery_attachments = [];
		foreach ( $attachments as $val ) {
			$gallery_attachments[ $val->ID ] = $val;
		}

		return $gallery_attachments;
	}

	/**
	 * Returns the attachments.
	 *
	 * @param array $args Array with query args, merged over the image-attachment defaults.
	 *
	 * @return array The found attachments.
	 */
	protected function get_attachments( $args ) {
		$default_args = [
			'post_status'         => 'inherit',
			'post_type'           => 'attachment',
			'post_mime_type'      => 'image',

			// Defaults taken from function get_posts.
			'orderby'             => 'date',
			'order'               => 'DESC',
			'meta_key'            => '',
			'meta_value'          => '',
			'suppress_filters'    => true,
			'ignore_sticky_posts' => true,
			'no_found_rows'       => true,
		];

		$args = wp_parse_args( $args, $default_args );

		$get_attachments = new WP_Query();

		return $get_attachments->query( $args );
	}
}