commit 92e122297f1f4ebc7fcf24358e64b7c9d1a1e1cc Author: rrr-marble Date: Sun Mar 6 02:13:53 2022 +0300 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fb6f3e7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.venv/ +src/__pycache__/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fe00269 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +# Requirements +### Minimal versions +python 3.7 +postgresql 11 (12 or newer if the generated column below is used) + +### Database schema +includes generated TSVECTOR column `geodata_search_ts` with GIN index `geodata_search_idx` +```sql +ALTER TABLE geodata ADD COLUMN geodata_search_ts TSVECTOR +GENERATED ALWAYS AS (to_tsvector('russian', geodata.description)) STORED; +``` +(use coalesce() when combining multiple columns) +```sql +CREATE INDEX geodata_search_idx ON geodata USING GIN (geodata_search_ts); +``` diff --git a/geodata-catalog.code-workspace b/geodata-catalog.code-workspace new file mode 100644 index 0000000..a07691a --- /dev/null +++ b/geodata-catalog.code-workspace @@ -0,0 +1,15 @@ +{ + "folders": [ + { + "path": "." 
+ } + ], + "settings": { + "python.defaultInterpreterPath": ".venv/bin/python", + "python.formatting.blackPath": "/usr/bin/black", + "python.linting.flake8Path": "/usr/bin/flake8", + "python.linting.flake8Args": [ + "--black-config=''" + ] + } +} \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..13cac2b --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,18 @@ +anyio==3.5.0 +asgiref==3.5.0 +click==8.0.4 +et-xmlfile==1.1.0 +fastapi==0.74.1 +greenlet==1.1.2 +h11==0.13.0 +idna==3.3 +openpyxl==3.0.9 +psycopg2==2.9.3 +pydantic==1.9.0 +python-multipart==0.0.5 +six==1.16.0 +sniffio==1.2.0 +SQLAlchemy==1.4.31 +starlette==0.17.1 +typing_extensions==4.1.1 +uvicorn==0.17.5 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f7ac231 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +anyio==3.5.0 +et-xmlfile==1.1.0 +fastapi==0.74.1 +greenlet==1.1.2 +idna==3.3 +openpyxl==3.0.9 +psycopg2==2.9.3 +pydantic==1.9.0 +python-multipart==0.0.5 +six==1.16.0 +sniffio==1.2.0 +SQLAlchemy==1.4.31 +starlette==0.17.1 +typing_extensions==4.1.1 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/crud.py b/src/crud.py new file mode 100644 index 0000000..fdd971a --- /dev/null +++ b/src/crud.py @@ -0,0 +1,62 @@ +from typing import List + +from sqlalchemy.orm import Session +from sqlalchemy.sql.expression import func +from sqlalchemy import inspect + +from . 
def get_item(db: Session, item_id: int):
    """Return the data set description with the given id, or None if absent."""
    return db.query(models.Item).filter(models.Item.id == item_id).first()


def get_item_by_description(db: Session, needle: str, skip: int = 0, limit: int = 20):
    """Full-text search over data set descriptions.

    ``needle`` is handed to PostgreSQL ``websearch_to_tsquery``, so the
    following query syntax is available to the caller:

    - plain text: all given words, in any order
    - "quoted text": the given words in the given order
    - OR: any of the given words
    - -: results must not contain the word

    Args:
        db: open SQLAlchemy session.
        needle: search expression in web-search syntax.
        skip: number of matching rows to skip (pagination offset).
        limit: maximum number of rows to return.

    Returns:
        List of matching ``models.Item`` rows ordered by id.
    """
    # hardcode russian for now
    # built-in postgres websearch_to_tsquery() is good
    # https://www.postgresql.org/docs/11/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES
    # TODO: add requested search columns [0]
    items_table = inspect(models.Item).local_table
    # The tsvector column is declared on models.Item as `geodata_search_ts`;
    # the table has no column called `ts`.
    search_vector = items_table.c.geodata_search_ts
    ts_query = func.websearch_to_tsquery('"russian"', needle)
    return (
        db.query(models.Item)
        .filter(search_vector.op("@@")(ts_query))
        .order_by(items_table.c.id)
        .offset(skip)
        .limit(limit)
        .all()
    )


def get_items(db: Session, skip: int = 0, limit: int = 20):
    """Return one page of the data set descriptions known to the system.

    Rows are ordered by id: OFFSET/LIMIT without ORDER BY gives no guarantee
    that consecutive pages are disjoint or stable between calls.
    """
    return (
        db.query(models.Item)
        .order_by(models.Item.id)
        .offset(skip)
        .limit(limit)
        .all()
    )


def insert_items(db: Session, items: List[schemas.ItemCreate]):
    """Bulk-import data set descriptions into the database.

    Args:
        db: open SQLAlchemy session.
        items: validated items to store.

    Returns:
        Tuple ``(accepted, inserted)``: how many items were passed in and by
        how much the row count of the table grew.
    """
    # NOTE(review): the before/after row counts are not isolated from
    # concurrent imports, so `inserted` is only exact for a single writer.
    before = db.query(models.Item).count()
    accepted = len(items)
    # models.ItemCreate is used for writing so the search column declared on
    # models.Item is not part of the INSERT.
    # https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#psycopg2-fast-execution-helpers
    db.add_all(models.ItemCreate(**item.dict()) for item in items)
    db.commit()
    inserted = db.query(models.Item).count() - before
    return (accepted, inserted)


def get_headers(db: Session):
    """Return all rows of the headers table (full column titles)."""
    return db.query(models.Header).all()
# TODO: switch for Alembic [2]
models.Base.metadata.create_all(bind=engine)

app = FastAPI()


# Dependency
def get_db():
    """Yield a database session and close it once the request is done."""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()


# NOTE: endpoint docstrings are published by FastAPI as the OpenAPI operation
# description, so they are user-facing text and intentionally left in Russian.
@app.post(
    "/items/",
    responses={
        422: {
            "description": "Загружен некорректный файл (ожидался .xlsx). "
            "Неизвестный заголовок. Или обнаружены данные в столбце без заголовка. "
            "Необходимо использовать файлы установленного образца."
        },
    },
    response_model=schemas.InsertStatus,
)
def create_items(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """импорт в базу данных Описаний наборов данных из Excel файлов"""
    # Import flow: parse the upload, reject unknown headers, map every data row
    # onto schemas.ItemCreate, bulk-insert, and report counts and elapsed time.
    # Raises HTTPException(422) for a non-.xlsx upload, for headers missing from
    # the headers table and for data found in a column without a header.
    try:
        start = datetime.now()
        # parse spreadsheet into a collection of header and data iterators
        # TODO: Multiple worksheets per Workbook or one? [3]
        # assume one for now (second one have strange formatting)
        with spreadsheet.parse(file=file) as spreadsheet_parse:
            sheet = spreadsheet_parse[0]
            # TODO use fullfledged validation framework? [6]
            # ad-hoc for now: compare against the headers known to the database
            # NB: the actual order of the columns is never checked
            known_headers = {dbh.spreadsheet for dbh in crud.get_headers(db=db)}
            unknown_headers = set(sheet["header"]) - known_headers
            if unknown_headers:
                raise HTTPException(
                    status_code=422,
                    detail="Unknown headers in the spreadsheet: {}. Check the coordinated format".format(
                        unknown_headers
                    ),
                )

            # Cells are matched to schema fields by position. zip() stops at the
            # shorter sequence, so a sheet with fewer columns than ItemCreate has
            # fields leaves the remaining (optional) fields at None instead of
            # failing with IndexError.
            field_names = tuple(schemas.ItemCreate.__fields__)
            spreadsheet_item_list = [
                schemas.ItemCreate(**dict(zip(field_names, row)))
                for row in sheet["data"]
            ]
            # dump all the data into database
            accepted, processed = crud.insert_items(db=db, items=spreadsheet_item_list)

    except spreadsheet.InvalidFileException:
        raise HTTPException(
            status_code=422, detail="Invalid file upload (expected .xlsx)"
        )
    except spreadsheet.DataInUnnamedColumnException:
        raise HTTPException(
            status_code=422, detail="Data is found in a column with empty header"
        )

    return schemas.InsertStatus(
        status="Success" if accepted == processed else "Warning",
        accepted=accepted,
        processed=processed,
        process_time=datetime.now() - start,
    )


@app.get(
    "/item/{item_id}",
    response_model=schemas.Item,
    responses={404: {"description": "Такой набор данных отсутствует"}},
)
def read_item(item_id: int, db: Session = Depends(get_db)):
    """индивидуальные страницы для каждого Описания набора данных"""
    # Single item lookup; answers 404 when the id is unknown.
    db_item = crud.get_item(db=db, item_id=item_id)
    if db_item is None:
        raise HTTPException(status_code=404, detail="Item not found")
    return db_item
# NOTE: endpoint docstrings are published by FastAPI as the OpenAPI operation
# description, so they are user-facing text and intentionally left in Russian.
@app.get("/items/", response_model=List[schemas.Item])
def read_items(skip: int = 0, limit: int = 20, db: Session = Depends(get_db)):
    """список доступных в системе Описаний наборов данных"""
    # Paginated listing of all stored items.
    return crud.get_items(db=db, skip=skip, limit=limit)


@app.get(
    "/search/",
    response_model=List[schemas.Item],
    responses={400: {"description": "Запрос слишком короткий (минимумл 3 символа)"}},
)
def search(q: str, skip: int = 0, limit: int = 20, db: Session = Depends(get_db)):
    """фильтры для поиска Описаний наборов данных по атрибутам
    дополнительный возможный синтаксис в запросах преобразуется
    в поисковый запрос содержащий:
    - простой текст: переданные слова в любом порядке
    - "текст в кавычках": переданные слова в указанном порядке
    - OR ИЛИ: какие-либо из переданных слов
    - -: не содержащий данного слова
    """
    # NOTE(review): FastAPI already URL-decodes query parameters; this second
    # decode is kept for clients that double-encode — confirm it is still needed.
    q = unquote_plus(q).strip()
    # Validate what the user actually typed: the check runs before the
    # ИЛИ -> OR rewrite (which shortens the text) and ignores surrounding
    # whitespace, so blank queries are rejected.
    if len(q) < 3:
        raise HTTPException(status_code=400, detail="Query too short")
    # replace all full ИЛИ words with OR, the operator PostgreSQL understands
    q = substitute(r"\bИЛИ\b", "OR", q, flags=IGNORECASE)
    return crud.get_item_by_description(db=db, needle=q, skip=skip, limit=limit)


@app.get("/headers/", response_model=List[schemas.Header])
def read_headers(db: Session = Depends(get_db)):
    """полные наименования столбцов таблиц"""
    # Full column titles as stored in the headers table.
    return crud.get_headers(db=db)
= Column(String) + oldid = Column(String) + category = Column(String) + basin = Column(String) + deposit = Column(String) + well = Column(String) + depth = Column(String) + stratum = Column(String) + owner = Column(String) + org = Column(String) + ownercontacts = Column(String) + samplelist = Column(String) + description = Column(String) + formdimentions = Column(String) + datalist = Column(String) + resolution = Column(String) + date = Column(String) + additionalinfo = Column(String) + scanner = Column(String) + comment = Column(String) + continuation = Column(String) + + +class ItemCreate(ItemBase): + pass + + +class Item(ItemBase): + + geodata_search_ts = Column(TSVECTOR) diff --git a/src/schemas.py b/src/schemas.py new file mode 100644 index 0000000..f4a13b3 --- /dev/null +++ b/src/schemas.py @@ -0,0 +1,69 @@ +from datetime import datetime, timedelta +from typing import Optional, Union + +from pydantic import BaseModel + + +class HeaderBase(BaseModel): + database: str + spreadsheet: str + + +class HeaderCreate(HeaderBase): + pass + + +class Header(HeaderBase): + """Заголовок описания набора геологических данных + по результатам экспедиционных исследований""" + + class Config: + orm_mode = True + + +class ItemBase(BaseModel): + """Описание набора геологических данных + по результатам экспедиционных исследований + """ + + fadr: Optional[str] = None + oldid: Optional[str] = None + category: Optional[str] = None + basin: Optional[str] = None + deposit: Optional[str] = None + well: Optional[str] = None + depth: Optional[str] = None + stratum: Optional[str] = None + owner: Optional[str] = None + org: Optional[str] = None + ownercontacts: Optional[str] = None + samplelist: Optional[str] = None + description: Optional[str] = None + formdimentions: Optional[str] = None + datalist: Optional[str] = None + resolution: Optional[str] = None + date: Optional[Union[datetime, str]] = None + additionalinfo: Optional[str] = None + scanner: Optional[str] = None + comment: 
class DataInUnnamedColumnException(Exception):
    """Raised when a column without a header contains data.

    Every column that holds any data has to be named.
    """


@contextmanager
def parse(file: UploadFile = File(...)):
    """Parse an uploaded spreadsheet into header/data iterators.

    Context manager. The upload is copied to a temporary file and opened with
    openpyxl in read-only (lazy) mode. The iterators it yields read from the
    open workbook, so they must be consumed inside the ``with`` body. On exit
    the workbook and the upload are closed and the temporary file is removed.

    Yields:
        A list with one dict per parsed sheet (currently only the active one)::

            [
                {"header": header_iterator, "data": data_iterator},  # sheet1
                # etc
            ]

    Raises:
        InvalidFileException: the upload is not a readable .xlsx workbook.
        DataInUnnamedColumnException: a column whose header cell is empty
            contains data.
    """
    # Local import: only needed to translate the "not a zip archive" failure.
    from zipfile import BadZipFile

    result = []
    # Bound before the try block so the cleanup below never touches an
    # undefined name when an early step (e.g. temp-file creation) fails.
    spreadsheet_file = None
    try:
        # TODO: decide if we use subdir in /tmp here [5]
        upload_dir = Path("/tmp/notgeo")
        upload_dir.mkdir(parents=True, exist_ok=True)
        suffix = Path(file.filename).suffix
        with NamedTemporaryFile(
            delete=False, suffix=suffix, dir=str(upload_dir)
        ) as tmp:
            spreadsheet_file = Path(tmp.name)
            copyfileobj(file.file, tmp)

        try:
            wb = load_workbook(spreadsheet_file, read_only=True)
        except BadZipFile as err:
            # Right extension, but the content is not an xlsx (zip) archive.
            raise InvalidFileException(
                "Uploaded file is not a valid .xlsx workbook"
            ) from err

        # Unlike a normal workbook, a read-only workbook uses lazy loading and
        # must be closed explicitly; contextlib.closing() does it for us.
        with closing(wb):
            # TODO: Multiple worksheets per Workbook or one? [3]
            # assume one for now (second one have strange formatting)
            ws = wb.active
            # assume headers are in the top row
            header = [
                cell
                for row in ws.iter_rows(max_row=1, values_only=True)
                for cell in row
            ]

            # Assume the named columns form one continuous stretch: the first
            # empty header cell marks its end. list.index() is 0-based, so that
            # index equals the 1-based number of the last named column.
            try:
                named_columns = header.index(None)
            except ValueError:
                # every cell of the first row is filled: all columns are named
                named_columns = len(header)
            else:
                # min_col= is 1-based, so the first unnamed column is
                # named_columns + 1. Walk its cells lazily, row by row.
                unnamed_cells = chain.from_iterable(
                    ws.iter_rows(min_col=named_columns + 1, values_only=True)
                )
                # Compare against None/"" explicitly: 0 and False are data too.
                if any(cell is not None and cell != "" for cell in unnamed_cells):
                    raise DataInUnnamedColumnException(
                        "Data is found in a column with empty header"
                    )

            # max_col= is 1-based and inclusive, so it is exactly the number
            # of named columns.
            data = ws.iter_rows(
                min_row=2, max_col=named_columns, values_only=True
            )
            result.append(
                {"header": (cell for cell in header if cell is not None), "data": data}
            )
            # END [3]

            # Yield while the workbook is still open so the lazy iterators work.
            yield result
    finally:
        # Close the upload synchronously: no event loop is needed just to close
        # the underlying file object.
        file.file.close()
        if spreadsheet_file is not None:
            remove(spreadsheet_file)