php_Document.php

<?php

namespace FindStr;

use WP_Error;

/**
 * Search-index representation of a single WordPress post.
 *
 * The constructor loads the post, normalises its title/content, collects the
 * featured image, language and sticky flag, then builds the `$to_index` array
 * (the fields selected in the indexable-fields settings plus a computed weight).
 */
class Document {

  /**
   * ID of the wrapped post.
   */
  public int $ID;

  /**
   * The underlying post object, passed as context to every filter.
   */
  private \WP_Post $post;

  /**
   * Post title with HTML entities decoded.
   */
  public string $post_title;

  /**
   * Post content rendered through `the_content`, then reduced to text
   * (see prepare_content()).
   */
  public string $post_content;

  /**
   * Post slug.
   */
  public string $post_name;

  /**
   * Path component of the post permalink (empty string when the URL has no path).
   */
  public string $permalink;

  /**
   * Language code(s) as returned by the `findstr_document_language` filter.
   *
   * @var string|array|mixed
   */
  public $language;

  /**
   * Taxonomy names attached to the post type (see post_add_taxonomies()).
   */
  public array $taxonomies;

  /**
   * Value of the post's `post_date` field.
   */
  public string $post_date;

  /**
   * Value of the post's `post_modified` field.
   */
  public string $post_modified;

  /**
   * Title of the parent post, or an empty string when there is no parent.
   */
  public string $post_parent;

  /**
   * Post type slug.
   */
  public string $post_type;

  /**
   * Featured image data as built by get_image_data() (empty array when none).
   */
  public array $featured_image;

  /**
   * Document weight; assigned by update_document_weight() when the filtered
   * weight is numeric.
   */
  public int $weight;

  /**
   * 1 when the post is sticky, 0 otherwise.
   */
  public int $sticky = 0;

  /**
   * Final key/value map handed to the indexer.
   */
  public array $to_index;

  /**
   * Display name of the post author.
   */
  private string $post_author;

  /**
   * Build the document for a post.
   *
   * @param int|\WP_Post|null $post_id anything accepted by get_post()
   *
   * @throws \Exception when the post cannot be found
   */
  public function __construct( $post_id ) {

    $post = get_post( $post_id );

    if ( empty( $post ) ) {
      throw new \Exception( 'Post not found' );
    }

    $this->post = $post;

    // ID must be set before prepare_content(): the warning handler logs it.
    $this->ID = $post->ID;

    $this->post_title = html_entity_decode( get_the_title( $post ), ENT_QUOTES | ENT_HTML5, 'UTF-8' );

    $this->post_content = $this->prepare_content( $post->post_content );

    $this->post_date = $post->post_date;

    $this->post_name = $post->post_name;

    $this->taxonomies = $this->post_add_taxonomies();

    $this->post_modified = $post->post_modified;

    // wp_parse_url() yields null/false when there is no usable path; the typed
    // string property would reject those values, so normalise to a string.
    $this->permalink = (string) wp_parse_url( get_permalink( $post->ID ), PHP_URL_PATH );

    $this->post_type = $post->post_type;

    $this->post_parent = empty( $post->post_parent ) ? '' : get_the_title( $post->post_parent );

    $this->post_author = get_the_author_meta( 'display_name', $post->post_author );

    $this->featured_image = $this->get_image_data(
      /**
       * Filter the featured image ID
       *
       * @hook findstr_featured_image_id
       *
       * @param {int} $featured_image_id the featured image ID
       * @param {WP_Post} $post the post object
       *
       * @returns {int} $featured_image_id the featured image ID
       *
       */
      apply_filters(
        'findstr_featured_image_id',
        get_post_thumbnail_id( $post ),
        $post
      )
    );

    $this->language = $this->get_language_code();

    $this->sticky = (int) is_sticky( $post->ID );

    // Everything read by get_fields_to_index() (post_type, permalink, language,
    // featured_image, sticky) is populated above this line.
    $this->to_index = $this->get_fields_to_index();

    $this->update_document_weight();

    /**
     * Filter the data to index
     *
     * @hook findstr_data_to_index
     *
     * @param {array} $fields the fields to index
     * @param {WP_Post} $post the post object
     *
     * @returns {array} $data the data to index
     *
     */
    $this->to_index = apply_filters( 'findstr_data_to_index', $this->to_index, $this->post );

  }

  /**
   * Get the `name` label of a post type.
   *
   * @param string $post_type post type slug
   *
   * @return string the `name` label, or the slug itself when the post type is not registered
   */
  public function get_post_type( $post_type ): string {
    $post_type_object = get_post_type_object( $post_type );

    // get_post_type_object() returns null for an unknown post type; reading
    // labels from it would be a fatal error.
    if ( null === $post_type_object ) {
      return (string) $post_type;
    }

    $labels = get_post_type_labels( $post_type_object );

    return $labels->name;
  }

  /**
   * Compute the document weight and store it in `$to_index['weight']`.
   *
   * Every weight rule whose `name`/`value` pair strictly matches a non-empty
   * entry of `$to_index` contributes its `weight`; the contributions are summed.
   * When no rule matches, the weight defaults to 10. The result is passed
   * through the `findstr_document_weight` filter.
   *
   * @return void
   */
  public function update_document_weight(): void {

    $settings_weight_management = (array) ( new \FindStr\SettingsWeightManagement() )->get();

    $weights = array();

    // Collect the weight of every rule matching an indexed attribute.
    foreach ( $settings_weight_management as $weight_rules ) {
      if ( ! empty( $this->to_index[ $weight_rules->name ] ) && $weight_rules->value === $this->to_index[ $weight_rules->name ] ) {
        $weights[] = $weight_rules->weight;
      }
    }

    if ( empty( $weights ) ) {
      $weights[] = 10;
    }

    /**
     * Filter the document weight
     *
     * @hook findstr_document_weight
     *
     * @param {int} $weight the weight of the document
     * @param {WP_Post} $post the post object
     *
     * @returns {int} $weight the weight of the document
     *
     */
    $weight = apply_filters( 'findstr_document_weight', array_sum( $weights ), $this->post );

    $this->to_index['weight'] = $weight;

    // Keep the public property in sync; skip non-numeric filter results so the
    // int-typed property never receives an unconvertible value.
    if ( is_numeric( $weight ) ) {
      $this->weight = (int) $weight;
    }
  }

  /**
   * Error handler installed while rendering the content: logs the
   * warning/notice instead of letting it reach the default handler.
   *
   * @param int    $error_no   error level
   * @param string $error_str  error message
   * @param string $error_file file in which the error was raised
   * @param int    $error_line line at which the error was raised
   *
   * @return void
   */
  public function warning_handler( $error_no, $error_str, $error_file, $error_line ) {
    new Log(
      $error_str,
      'warning',
      array(
        'function'   => 'Document->prepare_content',
        'ID'         => $this->ID,
        'error_no'   => $error_no,
        'error_file' => $error_file,
        'error_line' => $error_line,
      )
    );
  }

  /**
   * Render raw post content through `the_content` and reduce it to plain,
   * whitespace-normalised text.
   *
   * Warnings, notices and deprecations raised while rendering are routed to
   * warning_handler(); an `\Error` thrown by a filter callback is logged and
   * whatever was rendered up to that point is kept.
   *
   * @param string $content raw post content
   *
   * @return string
   */
  public function prepare_content( $content ) : string {

    // Route non-fatal errors raised by content filters to the log.
    // E_STRICT is intentionally not part of the mask: the constant is
    // deprecated as of PHP 8.4.
    //phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_set_error_handler
    set_error_handler( array( $this, 'warning_handler' ), E_WARNING | E_NOTICE | E_USER_WARNING | E_USER_NOTICE | E_DEPRECATED | E_USER_DEPRECATED );

    ob_start();

    try {
      echo apply_filters( 'the_content', $content );//phpcs:ignore WordPress.Security.EscapeOutput.OutputNotEscaped
    } catch ( \Error $e ) {
      new Log(
        $e->getMessage(),
        'error',
        array(
          'function' => 'Document->prepare_content',
          'ID'       => $this->ID,
        )
      );
    } finally {
      // Runs even when an exception that is not an \Error propagates, so the
      // output buffer and the custom error handler never leak to the caller.
      $content = ob_get_contents();
      ob_end_clean();

      restore_error_handler();
    }

    $content = $this->strip_all_tags( $content );
    $content = $this->strip_invisibles( $content );

    // Remove repeating consecutive words. preg_replace() returns null on PCRE
    // failure; keep the text as it is in that case rather than discarding it.
    $deduplicated = preg_replace( '/\b(\S+)(?:\s+\1\b)+/i', '$1', $content );
    if ( null !== $deduplicated ) {
      $content = $deduplicated;
    }

    return wp_kses_post( trim( $content ) );
  }

  /**
   * Get fields to index
   *
   * Builds the field map from the indexable-fields settings. A field id
   * prefixed with `cf/` is read from post meta, `acf/` through ACF's
   * get_field() (when available), `tax/` from the post terms; any other id is
   * read from the property of the same name on this object. `permalink`,
   * `language`, `featured_image`, `sticky` and `menu_order` are always present
   * in the result.
   *
   * @return array
   */
  public function get_fields_to_index() : array {

    $indexable_fields = (array) ( new SettingsIndexableFields() )->get();

    $fields = array(
      'ID' => $this->ID,
    );

    foreach ( $indexable_fields as $field ) {

      if ( strpos( $field->id, 'cf/' ) === 0 ) { // custom fields
        // Strip only the leading prefix: the meta key itself may contain "cf/".
        $meta_key = substr( $field->id, 3 );

        if ( metadata_exists( 'post', $this->ID, $meta_key ) ) {
          $fields[ $meta_key ] = get_post_meta( $this->ID, $meta_key, true ); //todo : add multiple values
        }
      } elseif ( strpos( $field->id, 'acf/' ) === 0 ) { //advanced custom fields
        $meta_key = substr( $field->id, 4 );

        if ( function_exists( 'get_field' ) ) {
          $fields[ $meta_key ] = get_field( $meta_key, $this->ID, true );
        }
      } elseif ( strpos( $field->id, 'tax/' ) === 0 ) { //taxonomies
        $taxonomy = substr( $field->id, 4 );

        // Skip taxonomies not attached to this post type. Note that this also
        // skips the per-field loop filter below for this field.
        if ( ! is_object_in_taxonomy( $this->post_type, $taxonomy ) ) {
          continue;
        }

        $post_terms = wp_get_post_terms( $this->ID, $taxonomy, array( 'fields' => 'names' ) );

        // wp_get_post_terms() may return a WP_Error; index no terms in that case.
        if ( ! is_array( $post_terms ) ) {
          $post_terms = array();
        }

        $post_terms = array_map(
          static function( $term ) {
            return html_entity_decode( $term, ENT_QUOTES | ENT_HTML5, 'UTF-8' );
          },
          $post_terms
        );

        /**
         * Filter the post terms.
         * This filter runs for each taxonomy to index based on indexable fields settings.
         *
         * @hook findstr_post_terms
         *
         * @param {array} $post_terms the terms to index
         * @param {string} $taxonomy the taxonomy name
         * @param {WP_Post} $post the post object
         *
         * @returns {array} $post_terms the terms to index
         *
         */
        $fields[ $taxonomy ] = apply_filters( 'findstr_post_terms', $post_terms, $taxonomy, $this->post );

      } elseif ( property_exists( $this, $field->id ) && isset( $this->{$field->id} ) ) {
        // isset() guards typed properties that are not initialised yet at this
        // point (e.g. weight, to_index); reading them would throw an Error.
        $fields[ $field->id ] = $this->{$field->id};
      }

      /**
       *
       * This filter runs for each field to index based on indexable fields settings.
       * It allows to modify the fields to index.
       * Note: defaults for permalink, language and featured_image are added after this loop.
       * This filter applies in a loop, $fields array is not fully built yet.
       *
       * @hook findstr_indexable_fields_loop_to_index
       *
       * @param {array} $fields the fields to index
       * @param {object} $field the field object
       * @param {WP_Post} $post the post object
       *
       * @returns {array} $fields the fields to index
       */
      $fields = apply_filters( 'findstr_indexable_fields_loop_to_index', $fields, $field, $this->post );

    }

    // Fall back to the document's own values when the loop left these empty.
    $fields['permalink'] = ! empty( $fields['permalink'] ) ? $fields['permalink'] : $this->permalink;
    $fields['language']  = ! empty( $fields['language'] ) ? $fields['language'] : $this->language;

    // A document available in several languages gets one permalink per language.
    if ( is_array( $fields['language'] ) ) {
      $fields['language']   = array_values( $fields['language'] );
      $fields['permalinks'] = array();
      foreach ( $fields['language'] as $language ) {
        $fields['permalinks'][ $language ] = apply_filters( 'wpml_permalink', get_permalink( $this->post ), $language );
      }
    }

    $fields['featured_image'] = ! empty( $fields['featured_image'] ) ? $fields['featured_image'] : $this->featured_image;

    // Apply the per-field filter to every collected value.
    foreach ( $fields as $key => $value ) {
      /**
       * Filter the indexable field.
       * This filter runs for each field to index based on indexable fields settings.
       *
       * @hook findstr_indexable_field
       *
       * @param {mixed} $field the field value
       * @param {string} $key the field key
       * @param {WP_Post} $post the post object
       *
       * @returns {mixed} $field the field value to index
       */
      $fields[ $key ] = apply_filters( 'findstr_indexable_field', $value, $key, $this->post );
    }

    // Added after the per-field filter, so `sticky` is not passed through it.
    $fields['sticky'] = $this->sticky;

    $fields['menu_order'] = ! empty( $fields['menu_order'] ) ? $fields['menu_order'] : $this->post->menu_order;

    return $fields;
  }

  /**
   * Get language code of a post.
   *
   * @return mixed|string the value returned by the `findstr_document_language` filter
   */
  public function get_language_code() {

    $post = $this->post;

    $language_code = Helpers::get_language_code_by_post_id( $post->ID );

    /**
     * Filter the document language.
     * with this filter you can change the language code of a post.
     *
     * @hook findstr_document_language
     *
     * @param {string|array} $language_code the language code
     * @param {WP_Post} $post the post object
     *
     * @returns {string|array} $language_code the language code
     */
    return apply_filters( 'findstr_document_language', $language_code, $post );
  }

  /**
   * Decode HTML entities and collapse every run of whitespace / invisible
   * characters matched by the pattern below into a single space.
   *
   * @param mixed $text text to clean; non-strings are converted with strval()
   *
   * @return array|string|string[]|null the value returned by the `findstr_strip_invisible` filter
   */
  public function strip_invisibles( $text ) {
    if ( ! is_string( $text ) ) {
      $text = strval( $text );
    }

    // decode html entities
    $text = html_entity_decode( $text, ENT_QUOTES, 'UTF-8' );
    // collapse runs of whitespace and invisible characters (byte-wise match)
    $text = self::replace_or_keep(
      "/(\t|\n|\v|\f|\r| |\xC2\x85|\xc2\xa0|\xe1\xa0\x8e|\xe2\x80[\x80-\x8D]|\xe2\x80\xa8|\xe2\x80\xa9|\xe2\x80\xaF|\xe2\x81\x9f|\xe2\x81\xa0|\xe3\x80\x80|\xef\xbb\xbf)+/",
      ' ',
      $text
    );

    /**
     * Filter the text after stripping invisible tags
     *
     * @hook findstr_strip_invisible
     *
     * @param {string} $text already stripped text
     *
     * @returns {string} $text
     */
    return apply_filters( 'findstr_strip_invisible', $text );
  }

  /**
   * Strip all tags
   *
   * Removes elements whose content is not meant to be read (style, script,
   * object, embed, applet, noscript, noembed, iframe, del, HTML comments),
   * then strips the remaining tags and collapses whitespace.
   *
   * @param string $content the post content; any non-string is treated as empty.
   *
   * @return string
   */
  public function strip_all_tags( $content ) : string {
    if ( ! is_string( $content ) ) {
      $content = '';
    }

    // Remove invisible elements together with their content. These patterns use
    // the "u" modifier, so invalid UTF-8 makes PCRE fail; replace_or_keep()
    // then leaves the content untouched instead of emptying it.
    $content = self::replace_or_keep(
      array(
        '@<style[^>]*?>.*?</style>@siu',
        '@<script[^>]*?.*?</script>@siu',
        '@<object[^>]*?.*?</object>@siu',
        '@<embed[^>]*?.*?</embed>@siu',
        '@<applet[^>]*?.*?</applet>@siu',
        '@<noscript[^>]*?.*?</noscript>@siu',
        '@<noembed[^>]*?.*?</noembed>@siu',
        '@<iframe[^>]*?.*?</iframe>@siu',
        '@<del[^>]*?.*?</del>@siu',
        '@<!--.*?-->@siu',
      ),
      ' ',
      $content
    );

    $content = str_replace( '<', ' <', $content ); //this helps to keep white spaces between tags
    $content = wp_strip_all_tags( $content );
    $content = self::replace_or_keep( '/<!--.*?-->/ms', ' ', $content );
    $content = self::replace_or_keep( '/<[!a-zA-Z\/][^>].*?>/ms', ' ', $content );
    $content = self::replace_or_keep( '/\s+/', ' ', $content );

    return $content;
  }

  /**
   * Get the image data for a given attachment ID
   *
   * @param int|mixed $attachment_id attachment ID
   *
   * @return array empty when the attachment has no metadata; otherwise `title`,
   *               `caption`, `alt`, `url` plus one `url`/`width`/`height` entry
   *               per registered size that resolves to an image
   */
  public function get_image_data( $attachment_id ): array {

    $image_meta = wp_get_attachment_metadata( $attachment_id );

    if ( ! $image_meta ) {
      return array();
    }

    $image_data = array(
      'title'   => get_the_title( $attachment_id ),
      'caption' => get_post_field( 'post_excerpt', $attachment_id ),
      'alt'     => get_post_meta( $attachment_id, '_wp_attachment_image_alt', true ),
      'url'     => wp_get_attachment_url( $attachment_id ),
    );

    if ( ! empty( $image_meta['sizes'] ) && is_array( $image_meta['sizes'] ) ) {
      foreach ( array_keys( $image_meta['sizes'] ) as $size ) {

        // Get the image URL at this size
        $image_src = wp_get_attachment_image_src( $attachment_id, $size );

        // wp_get_attachment_image_src() returns false when no image is
        // available; skip that size instead of reading offsets of a boolean.
        if ( ! is_array( $image_src ) ) {
          continue;
        }

        $image_data[ $size ] = array(
          'url'    => $image_src[0],
          'width'  => $image_src[1],
          'height' => $image_src[2],
        );
      }
    }

    return $image_data;
  }

  /**
   * Run preg_replace() and fall back to the unchanged subject when PCRE
   * reports a failure (null result).
   *
   * @param string|string[] $pattern     pattern(s) passed to preg_replace()
   * @param string          $replacement replacement string
   * @param string          $subject     text to process
   *
   * @return string
   */
  private static function replace_or_keep( $pattern, string $replacement, string $subject ): string {
    $result = preg_replace( $pattern, $replacement, $subject );

    return is_string( $result ) ? $result : $subject;
  }

  /**
   * Add taxonomies to the document
   *
   * Lists the taxonomies registered for the post type, minus
   * `translation_priority`, and passes them through the
   * `findstr_document_taxonomies` filter.
   *
   * @return array
   */
  private function post_add_taxonomies(): array {

    $taxonomies = get_object_taxonomies( $this->post->post_type );
    $taxonomies = array_diff( $taxonomies, array( 'translation_priority' ) );

    /**
     * Filter the document taxonomies.
     * List of taxonomies added to document.
     *
     * @hook findstr_document_taxonomies
     *
     * @param {array} $taxonomies the taxonomies
     * @param {WP_Post} $post the post object
     *
     * @returns {array} $taxonomies the taxonomies
     */

    return (array) apply_filters( 'findstr_document_taxonomies', $taxonomies, $this->post );
  }
}