Files
ai_english/data/import_primary_words_complete.py
2025-11-17 13:39:05 +08:00

321 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""完整导入小学英语核心词汇到数据库(包含所有字段)"""
import json
import os
import re
from datetime import datetime

import mysql.connector
import pandas as pd
# Database connection settings.
# SECURITY: the password used to be hard-coded in source; it is now read from
# the DB_PASSWORD environment variable, falling back to the old literal so
# existing behavior is unchanged. Prefer setting DB_PASSWORD.
db_config = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': os.environ.get('DB_PASSWORD', 'JKjk20011115'),
    'database': 'ai_english_learning',
    'charset': 'utf8mb4',
}

# ID of the vocabulary book all imported words are linked to.
BOOK_ID = 'primary_core_1000'
def clean_text(text):
    """Normalize a spreadsheet cell value.

    Returns the stripped string form of *text*, or None when the cell is
    NaN/None, empty, or the literal placeholder string 'nan'.
    """
    if pd.isna(text):
        return None
    stripped = str(text).strip()
    if stripped in ('', 'nan'):
        return None
    return stripped
def extract_part_of_speech(translation):
    """Infer an English part-of-speech label from a Chinese gloss.

    First looks for standard dictionary abbreviations ('v.', 'adj.', ...),
    then falls back to Chinese part-of-speech marker characters.  Returns
    'noun' when *translation* is falsy or nothing matches.
    """
    if not translation:
        return 'noun'
    pos_map = {
        'v.': 'verb',
        'n.': 'noun',
        'adj.': 'adjective',
        'adv.': 'adverb',
        'prep.': 'preposition',
        'conj.': 'conjunction',
        'pron.': 'pronoun',
        'interj.': 'interjection',
    }
    # Check longer abbreviations first: the original insertion-order scan
    # matched 'v.' inside 'adv.' (adverbs labeled as verbs) and the bare
    # 'n' inside 'conj.'.
    for abbr in sorted(pos_map, key=len, reverse=True):
        if abbr in translation or abbr.rstrip('.') in translation:
            return pos_map[abbr]
    # Fall back to Chinese marker characters.
    # NOTE(review): the original marker characters were lost in a bad paste
    # (they appeared as '' — the empty string — which matches every string,
    # so everything classified as 'verb').  The characters below are a
    # best-effort reconstruction; confirm against the original file.
    if '动' in translation:
        return 'verb'
    if '形容' in translation or '的' in translation:
        return 'adjective'
    if '地' in translation:
        return 'adverb'
    if '介' in translation:
        return 'preposition'
    if '连' in translation:
        return 'conjunction'
    return 'noun'  # default when no marker is found
def _split_to_json(text):
    """Split a semicolon-separated cell into a JSON array string.

    Returns '[]' for empty/None input.  NOTE(review): the original separator
    character was lost in a bad paste (split('') raises ValueError at
    runtime); both the fullwidth and ASCII semicolons are accepted here —
    confirm against the source spreadsheet.
    """
    if not text:
        return '[]'
    parts = [p.strip() for p in re.split('[;;]', text)]
    return json.dumps([p for p in parts if p], ensure_ascii=False)


def import_words_from_excel(file_path):
    """Import primary-school English vocabulary from an Excel file into MySQL.

    Reads one word per row, wipes the previous contents of the vocabulary
    book BOOK_ID, then inserts each word with its phonetics, definition,
    example sentences, synonym/antonym/derivative/collocation JSON blobs,
    and the book association.  Prints progress and a final summary.
    """
    # Initialized before the try so the finally block cannot raise NameError
    # when mysql.connector.connect() itself fails.
    conn = None
    cursor = None
    try:
        print(f"📖 正在读取文件: {file_path}")
        df = pd.read_excel(file_path)
        print(f"📊 文件列名: {df.columns.tolist()}")
        print(f"📊 总行数: {len(df)}")

        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor()

        # Remove the book's previous word links, then any vocabulary rows no
        # longer referenced by any book.
        print("\n清理旧数据...")
        cursor.execute("DELETE FROM ai_vocabulary_book_words WHERE book_id = %s", (BOOK_ID,))
        cursor.execute("""
            DELETE v FROM ai_vocabulary v
            LEFT JOIN ai_vocabulary_book_words bw ON bw.vocabulary_id = v.id
            WHERE bw.id IS NULL
        """)
        conn.commit()

        # Prepared SQL statements.
        insert_vocab_sql = """
            INSERT INTO ai_vocabulary
            (word, phonetic_us, phonetic_uk, phonetic, level, frequency, is_active,
             word_root, synonyms, antonyms, derivatives, collocations, created_at, updated_at)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        insert_definition_sql = """
            INSERT INTO ai_vocabulary_definitions
            (vocabulary_id, part_of_speech, definition_en, definition_cn, sort_order, created_at)
            VALUES (%s, %s, %s, %s, %s, %s)
        """
        insert_example_sql = """
            INSERT INTO ai_vocabulary_examples
            (vocabulary_id, sentence_en, sentence_cn, sort_order, created_at)
            VALUES (%s, %s, %s, %s, %s)
        """
        insert_book_word_sql = """
            INSERT INTO ai_vocabulary_book_words
            (book_id, vocabulary_id, sort_order, created_at)
            VALUES (%s, %s, %s, %s)
        """

        success_count = 0
        error_count = 0
        for index, row in df.iterrows():
            try:
                word = clean_text(row.get('Word'))
                if not word:
                    continue

                # Phonetics: prefer the US transcription for the generic field.
                phonetic_us = clean_text(row.get('美式音标'))
                phonetic_uk = clean_text(row.get('英式音标'))
                phonetic = phonetic_us or phonetic_uk

                translation_cn = clean_text(row.get('中文含义'))
                translation_en = clean_text(row.get('英文翻译(对应中文含义)'))
                if not translation_cn:
                    print(f"⚠️ 跳过 {word}:缺少中文含义")
                    continue
                if not translation_en:
                    # No English gloss in the sheet: fall back to the word itself.
                    translation_en = word

                part_of_speech = extract_part_of_speech(translation_cn)

                example_en = clean_text(row.get('例句'))
                example_cn = clean_text(row.get('例句中文翻译'))
                word_root = clean_text(row.get('词根'))

                # Semicolon-separated cells stored as JSON array strings.
                synonyms_json = _split_to_json(clean_text(row.get('同义词(含义)')))
                antonyms_json = _split_to_json(clean_text(row.get('反义词(含义)')))
                derivatives_json = _split_to_json(clean_text(row.get('派生词(含义)')))
                collocations_json = _split_to_json(clean_text(row.get('词组搭配(中文含义)')))

                now = datetime.now()
                cursor.execute(insert_vocab_sql, (
                    word,
                    phonetic_us,
                    phonetic_uk,
                    phonetic,
                    'beginner',      # level: this book is primary-school core
                    index + 1,       # frequency: 1-based row position
                    True,
                    word_root,
                    synonyms_json,
                    antonyms_json,
                    derivatives_json,
                    collocations_json,
                    now,
                    now,
                ))
                vocab_id = cursor.lastrowid

                # Primary definition (English gloss from the sheet, not the word).
                cursor.execute(insert_definition_sql, (
                    vocab_id,
                    part_of_speech,
                    translation_en,
                    translation_cn,
                    0,
                    now,
                ))

                # Example sentences: semicolon-separated, paired EN/CN by position.
                if example_en and example_cn:
                    pairs = zip(re.split('[;;]', example_en), re.split('[;;]', example_cn))
                    for i, (ex_en, ex_cn) in enumerate(pairs):
                        ex_en = ex_en.strip()
                        ex_cn = ex_cn.strip()
                        if ex_en and ex_cn:
                            cursor.execute(insert_example_sql, (vocab_id, ex_en, ex_cn, i, now))

                # Link the word into the vocabulary book, keeping sheet order.
                cursor.execute(insert_book_word_sql, (BOOK_ID, vocab_id, index, now))

                success_count += 1
                if success_count % 50 == 0:
                    print(f"✅ 已导入 {success_count} 个单词...")
                    conn.commit()
            except Exception as e:
                # Best-effort import: log the failing row and keep going.
                error_count += 1
                print(f"❌ 导入第 {index + 1} 行失败: {e}")
                print(f" 单词: {word if 'word' in locals() else 'N/A'}")

        conn.commit()

        # Record the final word count on the book row.
        cursor.execute(
            "UPDATE ai_vocabulary_books SET total_words = %s WHERE id = %s",
            (success_count, BOOK_ID)
        )
        conn.commit()

        print(f"\n🎉 导入完成!")
        print(f"✅ 成功: {success_count} 个单词")
        print(f"❌ 失败: {error_count} 个单词")

        # Verification: word, definition and example counts for the book.
        cursor.execute(
            "SELECT COUNT(*) FROM ai_vocabulary_book_words WHERE book_id = %s",
            (BOOK_ID,)
        )
        count = cursor.fetchone()[0]
        print(f"📊 词汇书中共有 {count} 个单词")

        cursor.execute("""
            SELECT COUNT(DISTINCT d.vocabulary_id)
            FROM ai_vocabulary_book_words bw
            JOIN ai_vocabulary_definitions d ON d.vocabulary_id = bw.vocabulary_id
            WHERE bw.book_id = %s
        """, (BOOK_ID,))
        def_count = cursor.fetchone()[0]
        print(f"📊 有释义的单词: {def_count}")

        cursor.execute("""
            SELECT COUNT(DISTINCT e.vocabulary_id)
            FROM ai_vocabulary_book_words bw
            JOIN ai_vocabulary_examples e ON e.vocabulary_id = bw.vocabulary_id
            WHERE bw.book_id = %s
        """, (BOOK_ID,))
        ex_count = cursor.fetchone()[0]
        print(f"📊 有例句的单词: {ex_count}")
    except Exception as e:
        print(f"❌ 导入失败: {e}")
        import traceback
        traceback.print_exc()
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
if __name__ == '__main__':
    # Default input workbook (relative path); run from the project root.
    import_words_from_excel('data/小学.xlsx')