> ## Documentation Index
> Fetch the complete documentation index at: https://private-7c7dfe99-mintlify-fbfa8bee.mintlify.site/llms.txt
> Use this file to discover all available pages before exploring further.

> Amazon 제품에 대한 1억 5천만 개 이상의 고객 리뷰

# Amazon 고객 리뷰

export const RunnableCode = ({children, run = false, showStats = true}) => {
  const [results, setResults] = useState(null);
  const [error, setError] = useState(null);
  const [loading, setLoading] = useState(false);
  const [showResults, setShowResults] = useState(false);
  const [stats, setStats] = useState(null);
  const [isDark, setIsDark] = useState(false);
  const [hoveredRow, setHoveredRow] = useState(-1);
  const codeRef = useRef(null);
  useEffect(() => {
    if (typeof window !== "undefined") {
      const check = () => setIsDark(document.documentElement.classList.contains("dark"));
      check();
      const observer = new MutationObserver(check);
      observer.observe(document.documentElement, {
        attributes: true,
        attributeFilter: ["class"]
      });
      return () => observer.disconnect();
    }
  }, []);
  useEffect(() => {
    if (codeRef.current) {
      const block = codeRef.current.querySelector(".code-block");
      if (block) {
        block.style.marginBottom = "0";
        block.style.marginTop = "0";
        block.style.borderBottomLeftRadius = "0";
        block.style.borderBottomRightRadius = "0";
      }
    }
  });
  const getSqlText = () => {
    if (!codeRef.current) return "";
    const code = codeRef.current.querySelector("code");
    return (code || codeRef.current).textContent.trim();
  };
  const executeQuery = async () => {
    const sql = getSqlText();
    if (!sql) return;
    setLoading(true);
    setError(null);
    setResults(null);
    setShowResults(true);
    try {
      const cleanQuery = sql.replace(/;$/, "").trim();
      const params = new URLSearchParams({
        query: cleanQuery,
        default_format: "JSONCompact",
        result_overflow_mode: "break",
        read_overflow_mode: "break",
        allow_experimental_analyzer: "1"
      });
      const res = await fetch(`https://sql-clickhouse.clickhouse.com/?${params.toString()}`, {
        method: "POST",
        headers: {
          Authorization: `Basic ${btoa(`demo:`)}`
        }
      });
      const text = await res.text();
      if (!res.ok) {
        setError(text || `HTTP ${res.status}`);
        setLoading(false);
        return;
      }
      const json = JSON.parse(text);
      setResults(json);
      setStats(json.statistics || null);
    } catch (err) {
      setError(err.message || "쿼리 실행에 실패했습니다");
    }
    setLoading(false);
  };
  useEffect(() => {
    if (run) executeQuery();
  }, []);
  const formatRows = n => {
    if (n >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
    if (n >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
    if (n >= 1e3) return `${(n / 1e3).toFixed(1)}K`;
    return String(n);
  };
  const formatBytes = b => {
    if (b >= 1e9) return `${(b / 1e9).toFixed(2)} GB`;
    if (b >= 1e6) return `${(b / 1e6).toFixed(2)} MB`;
    if (b >= 1e3) return `${(b / 1e3).toFixed(2)} KB`;
    return `${b} B`;
  };
  const isNumericType = type => {
    return (/^(UInt|Int|Float|Decimal)/).test(type);
  };
  const isHyperlink = value => {
    return typeof value === "string" && (/^https?:\/\//).test(value);
  };
  const computeColumnExtremes = (meta, data) => {
    const extremes = {};
    for (let i = 0; i < meta.length; i++) {
      if (isNumericType(meta[i].type)) {
        let min = Infinity, max = -Infinity;
        for (const row of data) {
          const v = Number(row[i]);
          if (!isNaN(v)) {
            if (v < min) min = v;
            if (v > max) max = v;
          }
        }
        if (max > -Infinity) {
          extremes[i] = {
            min,
            max
          };
        }
      }
    }
    return extremes;
  };
  const computeColumnWidths = (meta, data) => {
    const lengths = meta.map((col, i) => {
      const headerLen = col.name.length + col.type.length + 1;
      let maxData = 0;
      for (const row of data) {
        const v = row[i];
        const len = v === null ? 4 : String(v).length;
        if (len > maxData) maxData = len;
      }
      return Math.max(headerLen, maxData);
    });
    const total = lengths.reduce((s, l) => s + l, 0);
    return lengths.map(l => `${(l / total * 100).toFixed(1)}%`);
  };
  const copyResultsAsTSV = () => {
    if (!results || !results.meta || !results.data) return;
    const header = results.meta.map(col => col.name).join("\t");
    const rows = results.data.map(row => row.map(cell => cell === null ? "NULL" : String(cell)).join("\t"));
    const tsv = [header, ...rows].join("\n");
    navigator.clipboard.writeText(tsv);
  };
  const borderColor = isDark ? "rgba(255,255,255,0.15)" : "#e5e7eb";
  const bgColor = isDark ? "rgba(255,255,255,0.05)" : "#f9fafb";
  const headerBg = isDark ? "#2a2a2a" : "#f3f4f6";
  const textColor = isDark ? "#e5e7eb" : "#1f2937";
  const mutedColor = isDark ? "#d1d5db" : "#6b7280";
  const accentColor = isDark ? "#FAFF69" : "#323232";
  const accentTextColor = isDark ? "#000" : "#fff";
  const barColor = isDark ? "#35372f" : "#d2d2d2";
  const cellBg = isDark ? "#1f201b" : "#ffffff";
  const cellBgHover = isDark ? "lch(15.8 0 0)" : "#f0f0f0";
  const extremes = results && results.meta && results.data ? computeColumnExtremes(results.meta, results.data) : {};
  const colWidths = results && results.meta && results.data ? computeColumnWidths(results.meta, results.data) : [];
  const getCellBarStyle = (cell, ci, ri) => {
    if (cell === null) return null;
    const colMeta = results.meta[ci];
    if (!isNumericType(colMeta.type) || !extremes[ci] || results.data.length <= 1 || extremes[ci].max <= 0) return null;
    const ratio = 100 * Number(cell) / extremes[ci].max;
    const bg = ri === hoveredRow ? cellBgHover : cellBg;
    return {
      background: `linear-gradient(to right, ${barColor} 0%, ${barColor} ${ratio}%, ${bg} ${ratio}%, ${bg} 100%)`
    };
  };
  const renderCell = (cell, ci) => {
    if (cell === null) {
      return <span style={{
        color: mutedColor,
        fontStyle: "italic"
      }}>NULL</span>;
    }
    const value = String(cell);
    if (isHyperlink(value)) {
      return <a href={value} target="_blank" rel="noopener noreferrer" style={{
        color: accentColor,
        textDecoration: "underline",
        cursor: "pointer"
      }}>
          {value}
        </a>;
    }
    return value;
  };
  return <div className="not-prose" style={{
    margin: "1rem 0",
    width: "100%",
    boxSizing: "border-box",
    contain: "inline-size"
  }}>
      {}
      <div>
        <div ref={codeRef}>{children}</div>

        {}
        <div style={{
    display: "flex",
    justifyContent: "space-between",
    alignItems: "center",
    padding: "6px 12px",
    backgroundColor: headerBg,
    borderWidth: "0 1px 1px 1px",
    borderStyle: "solid",
    borderColor: isDark ? "rgba(255,255,255,0.1)" : "rgba(11,11,11,0.1)",
    borderRadius: "0 0 4px 4px"
  }}>
          <div style={{
    display: "flex",
    alignItems: "center",
    gap: "12px"
  }}>
            {results && <button onClick={() => setShowResults(!showResults)} style={{
    background: "none",
    border: "none",
    cursor: "pointer",
    color: mutedColor,
    fontSize: "12px",
    padding: "2px 4px"
  }}>
                {showResults ? "▼ 결과 숨기기" : "▶ 결과 표시"}
              </button>}
            {showStats && stats && <span style={{
    fontSize: "11px",
    color: mutedColor,
    fontStyle: "italic"
  }}>
                {formatRows(stats.rows_read)}행, {formatBytes(stats.bytes_read)} 읽음 ({stats.elapsed.toFixed(3)}s)
              </span>}
          </div>
          <button onClick={() => executeQuery()} disabled={loading} style={{
    display: "flex",
    alignItems: "center",
    gap: "6px",
    padding: "4px 14px",
    borderRadius: "4px",
    border: "none",
    cursor: loading ? "wait" : "pointer",
    backgroundColor: accentColor,
    color: accentTextColor,
    fontSize: "12px",
    fontWeight: 600
  }}>
            {loading ? <span>실행 중...</span> : <>
                <span style={{
    fontSize: "10px"
  }}>▶</span>
                <span>실행</span>
              </>}
          </button>
        </div>
      </div>

      {}
      {showResults && <div className="not-prose" style={{
    marginTop: "8px",
    maxHeight: "350px",
    overflow: "auto",
    border: `1px solid ${borderColor}`,
    borderRadius: "4px"
  }}>
          <div>
            {loading && <div style={{
    padding: "24px",
    textAlign: "center",
    color: mutedColor
  }}>쿼리 실행 중...</div>}

            {error && <div style={{
    padding: "12px 16px",
    color: "#ef4444",
    backgroundColor: isDark ? "rgba(239,68,68,0.1)" : "#fef2f2",
    fontSize: "13px",
    fontFamily: "monospace",
    whiteSpace: "pre-wrap"
  }}>
                {error}
              </div>}

            {results && results.meta && results.data && <div style={{
    display: "grid",
    gridTemplateColumns: colWidths.join(" "),
    width: "100%",
    fontSize: "13px",
    fontFamily: 'ui-monospace, SFMono-Regular, "SF Mono", Menlo, Consolas, monospace'
  }}>
                {results.meta.map((col, i) => <div key={`h-${i}`} style={{
    position: "sticky",
    top: 0,
    zIndex: 1,
    padding: "6px 12px",
    textAlign: isNumericType(col.type) && results.meta.length > 1 ? "right" : "left",
    backgroundColor: headerBg,
    borderBottom: `1px solid ${borderColor}`,
    color: textColor,
    fontWeight: 600,
    fontSize: "12px",
    whiteSpace: "nowrap",
    overflow: "hidden",
    textOverflow: "ellipsis"
  }}>
                    {col.name}
                    <span style={{
    color: mutedColor,
    fontWeight: 400,
    marginLeft: "4px",
    fontSize: "10px"
  }}>{col.type}</span>
                  </div>)}
                {results.data.map((row, ri) => row.map((cell, ci) => <div key={`${ri}-${ci}`} onMouseEnter={() => setHoveredRow(ri)} onMouseLeave={() => setHoveredRow(-1)} style={{
    padding: "4px 12px",
    color: textColor,
    whiteSpace: "nowrap",
    overflow: "hidden",
    textOverflow: "ellipsis",
    textAlign: isNumericType(results.meta[ci].type) && results.meta.length > 1 ? "right" : "left",
    borderBottom: `1px solid ${borderColor}`,
    backgroundColor: ri === hoveredRow ? cellBgHover : ri % 2 === 0 ? "transparent" : bgColor,
    transition: "background-color 0.1s",
    ...getCellBarStyle(cell, ci, ri)
  }}>
                      {renderCell(cell, ci)}
                    </div>))}
              </div>}

            {results && results.data && <div style={{
    display: "flex",
    justifyContent: "space-between",
    alignItems: "center",
    padding: "4px 12px",
    fontSize: "11px",
    color: mutedColor,
    borderTop: `1px solid ${borderColor}`,
    backgroundColor: headerBg
  }}>
                <span>
                  {results.rows}행
                </span>
                <button onClick={copyResultsAsTSV} style={{
    background: "none",
    border: "none",
    cursor: "pointer",
    color: mutedColor,
    fontSize: "11px",
    padding: "2px 6px",
    borderRadius: "3px"
  }} onMouseEnter={e => e.target.style.color = textColor} onMouseLeave={e => e.target.style.color = mutedColor}>
                  ⧉ TSV 복사
                </button>
              </div>}
          </div>
        </div>}
    </div>;
};

이 데이터셋에는 Amazon 제품에 대한 고객 리뷰가 1억 5천만 건 이상 포함되어 있습니다. 데이터는 AWS S3에 있는 snappy 압축 Parquet 파일로 제공되며, 전체 크기는 49GB(압축 기준)입니다. 이제 이를 ClickHouse에 삽입하는 단계를 차례대로 살펴보겠습니다.

<Note>
  아래 쿼리는 ClickHouse Cloud의 **Production** 인스턴스에서 실행되었습니다. 자세한 내용은
  ["Playground 사양"](/ko/get-started/sample-datasets/playground#specifications)을 참조하십시오.
</Note>

<div id="loading-the-dataset">
  ## 데이터셋 불러오기
</div>

1. 데이터를 ClickHouse에 삽입하지 않고도 원본 위치에서 바로 쿼리할 수 있습니다. 어떤 모습인지 확인할 수 있도록 몇 개의 행을 가져오겠습니다:

```sql theme={null}
SELECT *
FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/amazon_reviews_2015.snappy.parquet')
LIMIT 3
```

행은 다음과 같습니다:

```response theme={null}
Row 1:
──────
review_date:       16462
marketplace:       US
customer_id:       25444946 -- 2,544만
review_id:         R146L9MMZYG0WA
product_id:        B00NV85102
product_parent:    908181913 -- 9억 818만
product_title:     XIKEZAN iPhone 6 Plus 5.5 inch Waterproof Case, Shockproof Dirtproof Snowproof Full Body Skin Case Protective Cover with Hand Strap & Headphone Adapter & Kickstand
product_category:  Wireless
star_rating:       4
helpful_votes:     0
total_votes:       0
vine:              false
verified_purchase: true
review_headline:   case is sturdy and protects as I want
review_body:       I won't count on the waterproof part (I took off the rubber seals at the bottom because the got on my nerves). But the case is sturdy and protects as I want.

Row 2:
──────
review_date:       16462
marketplace:       US
customer_id:       1974568 -- 197만
review_id:         R2LXDXT293LG1T
product_id:        B00OTFZ23M
product_parent:    951208259 -- 9억 5,121만
product_title:     Season.C Chicago Bulls Marilyn Monroe No.1 Hard Back Case Cover for Samsung Galaxy S5 i9600
product_category:  Wireless
star_rating:       1
helpful_votes:     0
total_votes:       0
vine:              false
verified_purchase: true
review_headline:   One Star
review_body:       Cant use the case because its big for the phone. Waist of money!

Row 3:
──────
review_date:       16462
marketplace:       US
customer_id:       24803564 -- 2,480만
review_id:         R7K9U5OEIRJWR
product_id:        B00LB8C4U4
product_parent:    524588109 -- 5억 2,459만
product_title:     iPhone 5s Case, BUDDIBOX [Shield] Slim Dual Layer Protective Case with Kickstand for Apple iPhone 5 and 5s
product_category:  Wireless
star_rating:       4
helpful_votes:     0
total_votes:       0
vine:              false
verified_purchase: true
review_headline:   but overall this case is pretty sturdy and provides good protection for the phone
review_body:       The front piece was a little difficult to secure to the phone at first, but overall this case is pretty sturdy and provides good protection for the phone, which is what I need. I would buy this case again.
```

2. ClickHouse에 이 데이터를 저장할 `amazon_reviews`라는 이름의 새 `MergeTree` 테이블을 정의하겠습니다:

```sql theme={null}
CREATE DATABASE amazon

CREATE TABLE amazon.amazon_reviews
(
    `review_date` Date,
    `marketplace` LowCardinality(String),
    `customer_id` UInt64,
    `review_id` String,
    `product_id` String,
    `product_parent` UInt64,
    `product_title` String,
    `product_category` LowCardinality(String),
    `star_rating` UInt8,
    `helpful_votes` UInt32,
    `total_votes` UInt32,
    `vine` Bool,
    `verified_purchase` Bool,
    `review_headline` String,
    `review_body` String,
    PROJECTION helpful_votes
    (
        SELECT *
        ORDER BY helpful_votes
    )
)
ENGINE = MergeTree
ORDER BY (review_date, product_category)
```

3. 다음 `INSERT` 명령은 `s3Cluster` 테이블 함수를 사용하며, 이를 통해 클러스터의 모든 노드를 활용해 여러 S3 파일을 병렬로 처리할 수 있습니다. 또한 `https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/amazon_reviews_*.snappy.parquet`로 시작하는 모든 파일을 삽입하기 위해 와일드카드를 사용합니다:

```sql theme={null}
INSERT INTO amazon.amazon_reviews SELECT *
FROM s3Cluster('default', 
'https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/amazon_reviews_*.snappy.parquet')
```

<Tip>
  ClickHouse Cloud에서는 클러스터 이름이 `default`입니다. `default`를 사용 중인 클러스터 이름으로 바꾸십시오. 클러스터가 없다면 `s3Cluster` 대신 `s3` 테이블 함수를 사용해도 됩니다.
</Tip>

5. 이 쿼리는 오래 걸리지 않으며, 평균적으로 초당 약 300,000행을 처리합니다. 5분 정도 지나면 모든 행이 삽입된 것을 확인할 수 있습니다:

<RunnableCode>
  ```sql theme={null}
  SELECT formatReadableQuantity(count())
  FROM amazon.amazon_reviews
  ```
</RunnableCode>

6. 데이터가 얼마나 많은 공간을 사용하고 있는지 확인해 보겠습니다:

<RunnableCode>
  ```sql theme={null}
  SELECT
      disk_name,
      formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed,
      formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed,
      round(usize / size, 2) AS compr_rate,
      sum(rows) AS rows,
      count() AS part_count
  FROM system.parts
  WHERE (active = 1) AND (table = 'amazon_reviews')
  GROUP BY disk_name
  ORDER BY size DESC
  ```
</RunnableCode>

원본 데이터는 약 70G였지만, ClickHouse에서 압축하면 약 30G 정도를 차지합니다.

<div id="example-queries">
  ## 예시 쿼리
</div>

7. 몇 가지 쿼리를 실행해 보겠습니다. 다음은 데이터셋에서 가장 유용하다고 평가된 리뷰 상위 10개입니다:

<RunnableCode>
  ```sql theme={null}
  SELECT
      product_title,
      review_headline
  FROM amazon.amazon_reviews
  ORDER BY helpful_votes DESC
  LIMIT 10
  ```
</RunnableCode>

<Note>
  이 쿼리는 성능 향상을 위해 [프로젝션](/ko/concepts/features/projections/projections)을 사용합니다.
</Note>

8. 다음은 Amazon에서 리뷰 수가 가장 많은 제품 상위 10개입니다:

<RunnableCode>
  ```sql theme={null}
  SELECT
      any(product_title),
      count()
  FROM amazon.amazon_reviews
  GROUP BY product_id
  ORDER BY 2 DESC
  LIMIT 10;
  ```
</RunnableCode>

9. 다음은 각 제품의 월별 평균 리뷰 평점입니다(실제 [Amazon 면접 질문](https://datalemur.com/questions/sql-avg-review-ratings)입니다!):

<RunnableCode>
  ```sql theme={null}
  SELECT
      toStartOfMonth(review_date) AS month,
      any(product_title),
      avg(star_rating) AS avg_stars
  FROM amazon.amazon_reviews
  GROUP BY
      month,
      product_id
  ORDER BY
      month DESC,
      product_id ASC
  LIMIT 20;
  ```
</RunnableCode>

10. 다음은 제품 카테고리별 총 투표 수입니다. `product_category`가 프라이머리 키(primary key)에 포함되어 있으므로 이 쿼리는 빠르게 실행됩니다:

<RunnableCode>
  ```sql theme={null}
  SELECT
      sum(total_votes),
      product_category
  FROM amazon.amazon_reviews
  GROUP BY product_category
  ORDER BY 1 DESC
  ```
</RunnableCode>

11. 이제 리뷰에서 **"awful"** 이라는 단어가 가장 자주 등장하는 제품을 찾아보겠습니다. 이는 상당히 큰 작업입니다. 단어 하나를 찾기 위해 1억 5,100만 개가 넘는 문자열을 파싱해야 합니다:

```sql runnable settings={'enable_parallel_replicas':1} theme={null}
SELECT
    product_id,
    any(product_title),
    avg(star_rating),
    count() AS count
FROM amazon.amazon_reviews
WHERE position(review_body, 'awful') > 0
GROUP BY product_id
ORDER BY count DESC
LIMIT 50;
```

이처럼 많은 데이터에서도 쿼리 시간이 어떻게 나오는지 확인해 보세요. 결과를 읽는 재미도 쏠쏠합니다!

12. 이번에는 리뷰에서 **awesome**을 검색하는 것만 빼고, 동일한 쿼리를 다시 실행할 수 있습니다:

```sql runnable settings={'enable_parallel_replicas':1} theme={null}
SELECT 
    product_id,
    any(product_title),
    avg(star_rating),
    count() AS count
FROM amazon.amazon_reviews
WHERE position(review_body, 'awesome') > 0
GROUP BY product_id
ORDER BY count DESC
LIMIT 50;
```
